File: squeeze.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./squeeze ./squeeze.c
  29 */
  30 
  31 #include <stdbool.h>
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <string.h>
  35 #include <unistd.h>
  36 
  37 #ifdef _WIN32
  38 #include <fcntl.h>
  39 #include <windows.h>
  40 #endif
  41 
  42 #ifdef RED_ERRORS
  43 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  44 #ifdef __APPLE__
  45 #define ERROR_STYLE "\x1b[31m"
  46 #endif
  47 #define RESET_STYLE "\x1b[0m"
  48 #else
  49 #define ERROR_STYLE
  50 #define RESET_STYLE
  51 #endif
  52 
  53 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  54 
  55 #define BAD_ALLOC 2
  56 
  57 #ifndef OBUF_SIZE
  58 #define OBUF_SIZE (8 * 1024)
  59 #endif
  60 
  61 const char* info = ""
  62 "squeeze [filenames...]\n"
  63 "\n"
  64 "Ignore leading/trailing spaces (and carriage-returns) on lines, also turning\n"
  65 "all runs of multiple consecutive spaces into single spaces. Spaces around\n"
  66 "tabs are ignored as well.\n"
  67 "";
  68 
  69 // bufwriter is, as the name implies, a buffered-writer: when it's aimed at
  70 // stdout, it considerably speeds up this app, as intended
  71 typedef struct bufwriter {
  72     // buf is the buffer proper
  73     unsigned char* buf;
  74 
  75     // len is how many bytes of the buffer are currently being used
  76     size_t len;
  77 
  78     // cap is the capacity of the buffer, or the most bytes it can hold
  79     size_t cap;
  80 
  81     // out is the destination of all that's written into the buffer
  82     FILE* out;
  83 } bufwriter;
  84 
  85 void init_bufwriter(bufwriter* w, FILE* out, unsigned char* b, size_t cap) {
  86     w->buf = b;
  87     w->len = 0;
  88     w->cap = cap;
  89     w->out = out;
  90 }
  91 
  92 void write_byte(bufwriter* w, unsigned char b) {
  93     if (w->len < w->cap) {
  94         w->buf[w->len++] = b;
  95         return;
  96     }
  97 
  98     fwrite(w->buf, 1, w->cap, w->out);
  99     w->buf[0] = b;
 100     w->len = 1;
 101 }
 102 
 103 void flush(bufwriter* w) {
 104     if (w->len > 0) {
 105         fwrite(w->buf, 1, w->len, w->out);
 106     }
 107     w->len = 0;
 108     fflush(w->out);
 109 }
 110 
 111 // slice is a growable region of bytes in memory
 112 typedef struct slice {
 113     // ptr is the starting place of the region
 114     unsigned char* ptr;
 115 
 116     // cap is how many bytes the memory region has available
 117     size_t cap;
 118 } slice;
 119 
 120 bool starts_with_bom(const unsigned char* b, const size_t n) {
 121     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 122 }
 123 
 124 void write_squeezed_line(bufwriter* w, const unsigned char* p, size_t len) {
 125     // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
 126     if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
 127         len -= 2;
 128     } else if (len >= 1 && p[len - 1] == '\n') {
 129         len--;
 130     }
 131 
 132     // ignore leading spaces
 133     while (len > 0 && p[0] == ' ') {
 134         p++;
 135         len--;
 136     }
 137 
 138     // ignore trailing spaces
 139     // while (len > 0 && p[len - 1] == ' ') {
 140     //     len--;
 141     // }
 142 
 143     // trailing spaces won't be emitted, since there's nothing to follow
 144     // them, which in turn prevents their normal single-space replacement
 145     // from being emitted
 146 
 147     bool space = false;
 148 
 149     for (size_t i = 0; i < len; i++) {
 150         unsigned char b = p[i];
 151 
 152         if (b == ' ') {
 153             space = true;
 154             continue;
 155         }
 156 
 157         if (b == '\t') {
 158             space = false;
 159             write_byte(w, '\t');
 160 
 161             // ignore spaces right after tabs
 162             while (i + 1 < len && p[i + 1] == ' ') {
 163                 i++;
 164             }
 165 
 166             continue;
 167         }
 168 
 169         if (space) {
 170             write_byte(w, ' ');
 171             space = false;
 172         }
 173         write_byte(w, b);
 174     }
 175 
 176     write_byte(w, '\n');
 177 }
 178 
 179 void handle_reader(bufwriter* w, FILE* r, slice* line, bool live_lines) {
 180     for (size_t i = 0; !feof(w->out); i++) {
 181         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 182         if (len < 0) {
 183             break;
 184         }
 185 
 186         if (line->ptr == NULL) {
 187             fprintf(stderr, "\n");
 188             fprintf(stderr, ERROR_LINE("out of memory"));
 189             exit(BAD_ALLOC);
 190         }
 191 
 192         // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
 193         if (i == 0 && starts_with_bom(line->ptr, len)) {
 194             write_squeezed_line(w, line->ptr + 3, len - 3);
 195             continue;
 196         }
 197 
 198         write_squeezed_line(w, line->ptr, len);
 199         if (live_lines) {
 200             flush(w);
 201         }
 202     }
 203 
 204     if (!live_lines) {
 205         flush(w);
 206     }
 207 }
 208 
 209 // handle_file handles data from the filename given; returns false only when
 210 // the file can't be opened
 211 bool handle_file(bufwriter* w, const char* path, slice* line, bool live_lines) {
 212     FILE* f = fopen(path, "rb");
 213     if (f == NULL) {
 214         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 215         return false;
 216     }
 217 
 218     handle_reader(w, f, line, live_lines);
 219     fclose(f);
 220     return true;
 221 }
 222 
 223 // run returns the number of errors
 224 int run(int argc, char** argv, FILE* w, bool live_lines) {
 225     unsigned char outbuf[OBUF_SIZE];
 226     bufwriter bw;
 227 
 228     size_t dashes = 0;
 229     for (int i = 1; i < argc; i++) {
 230         if (argv[i][0] == '-' && argv[i][1] == 0) {
 231             dashes++;
 232         }
 233     }
 234 
 235     if (dashes > 1) {
 236         const char* m = "can't use the standard input (dash) more than once";
 237         fprintf(stderr, ERROR_LINE("%s"), m);
 238         return 1;
 239     }
 240 
 241     slice line;
 242     line.cap = 32 * 1024;
 243     line.ptr = malloc(line.cap);
 244 
 245     if (line.ptr == NULL) {
 246         fprintf(stderr, ERROR_LINE("out of memory"));
 247         exit(BAD_ALLOC);
 248     }
 249 
 250     init_bufwriter(&bw, w, outbuf, sizeof(outbuf));
 251 
 252     size_t errors = 0;
 253     for (int i = 1; i < argc && !feof(w); i++) {
 254         if (argv[i][0] == '-' && argv[i][1] == 0) {
 255             handle_reader(&bw, stdin, &line, live_lines);
 256             continue;
 257         }
 258 
 259         if (!handle_file(&bw, argv[i], &line, live_lines)) {
 260             errors++;
 261         }
 262     }
 263 
 264     // use stdin when not given any filepaths
 265     if (argc <= 1) {
 266         handle_reader(&bw, stdin, &line, live_lines);
 267     }
 268 
 269     free(line.ptr);
 270     flush(&bw);
 271     return errors;
 272 }
 273 
 274 int main(int argc, char** argv) {
 275 #ifdef _WIN32
 276     setmode(fileno(stdin), O_BINARY);
 277     // ensure output lines end in LF instead of CRLF on windows
 278     setmode(fileno(stdout), O_BINARY);
 279     setmode(fileno(stderr), O_BINARY);
 280 #endif
 281 
 282     if (argc > 1) {
 283         if (
 284             strcmp(argv[1], "-h") == 0 ||
 285             strcmp(argv[1], "-help") == 0 ||
 286             strcmp(argv[1], "--h") == 0 ||
 287             strcmp(argv[1], "--help") == 0
 288         ) {
 289             fprintf(stdout, "%s", info);
 290             return 0;
 291         }
 292     }
 293 
 294     const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0;
 295     if (!live_lines) {
 296         setvbuf(stdout, NULL, _IOFBF, 0);
 297     }
 298     return run(argc, argv, stdout, live_lines) == 0 ? 0 : 1;
 299 }