File: squeeze.c 1 /* 2 The MIT License (MIT) 3 4 Copyright © 2020-2025 pacman64 5 6 Permission is hereby granted, free of charge, to any person obtaining a copy of 7 this software and associated documentation files (the “Software”), to deal 8 in the Software without restriction, including without limitation the rights to 9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 of the Software, and to permit persons to whom the Software is furnished to do 11 so, subject to the following conditions: 12 13 The above copyright notice and this permission notice shall be included in all 14 copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 SOFTWARE. 23 */ 24 25 /* 26 You can build this command-line app by running 27 28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./squeeze ./squeeze.c 29 */ 30 31 #include <stdbool.h> 32 #include <stdio.h> 33 #include <stdlib.h> 34 #include <string.h> 35 36 #ifdef _WIN32 37 #include <fcntl.h> 38 #include <windows.h> 39 #endif 40 41 #ifdef RED_ERRORS 42 #define ERROR_STYLE "\x1b[38;2;204;0;0m" 43 #ifdef __APPLE__ 44 #define ERROR_STYLE "\x1b[31m" 45 #endif 46 #define RESET_STYLE "\x1b[0m" 47 #else 48 #define ERROR_STYLE 49 #define RESET_STYLE 50 #endif 51 52 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n") 53 54 const char* info = "" 55 "squeeze [filenames...]\n" 56 "\n" 57 "Ignore leading/trailing spaces (and carriage-returns) on lines, also turning\n" 58 "all runs of multiple consecutive spaces into single spaces. Spaces around\n" 59 "tabs are ignored as well.\n" 60 ""; 61 62 // bufwriter is, as the name implies, a buffered-writer: when it's aimed at 63 // stdout, it considerably speeds up this app, as intended 64 typedef struct bufwriter { 65 // buf is the buffer proper 66 unsigned char* buf; 67 68 // len is how many bytes of the buffer are currently being used 69 size_t len; 70 71 // cap is the capacity of the buffer, or the most bytes it can hold 72 size_t cap; 73 74 // out is the destination of all that's written into the buffer 75 FILE* out; 76 } bufwriter; 77 78 void init_bufwriter(bufwriter* w, FILE* out, unsigned char* b, size_t cap) { 79 w->buf = b; 80 w->len = 0; 81 w->cap = cap; 82 w->out = out; 83 } 84 85 void write_byte(bufwriter* w, unsigned char b) { 86 if (w->len < w->cap) { 87 w->buf[w->len++] = b; 88 return; 89 } 90 91 fwrite(w->buf, w->cap, 1, w->out); 92 w->buf[0] = b; 93 w->len = 1; 94 } 95 96 void flush(bufwriter* w) { 97 if (w->len > 0) { 98 fwrite(w->buf, w->len, 1, w->out); 99 } 100 w->len = 0; 101 fflush(w->out); 102 } 103 104 // slice is a growable region of bytes in memory 105 typedef struct slice { 106 // ptr is the starting place of the region 107 unsigned char* ptr; 108 109 // len is how many bytes are currently being used 110 size_t len; 111 112 // cap is how many bytes the memory region has available 113 size_t cap; 114 } slice; 115 116 bool starts_with_bom(const unsigned char* b, const size_t n) { 117 return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf); 118 } 119 120 void write_squeezed_line(bufwriter* w, const unsigned char* p, size_t len) { 121 // ignore trailing CRLF byte-pairs, or trailing line-feed bytes 122 if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') { 123 len -= 2; 124 } else if (len >= 1 && p[len - 1] == '\n') { 125 len--; 126 } 127 128 // ignore leading spaces 129 while (len > 0 && p[0] == ' ') { 130 p++; 131 len--; 132 } 133 134 // ignore trailing spaces 135 // while (len > 0 && p[len - 1] == ' ') { 136 // len--; 137 // } 138 139 // trailing spaces won't be emitted, since there's nothing to follow 140 // them, which in turn prevents their normal single-space replacement 141 // from being emitted 142 143 bool space = false; 144 145 for (size_t i = 0; i < len; i++) { 146 unsigned char b = p[i]; 147 148 if (b == ' ') { 149 space = true; 150 continue; 151 } 152 153 if (b == '\t') { 154 space = false; 155 write_byte(w, '\t'); 156 157 // ignore spaces right after tabs 158 while (i + 1 < len && p[i + 1] == ' ') { 159 i++; 160 } 161 162 continue; 163 } 164 165 if (space) { 166 write_byte(w, ' '); 167 space = false; 168 } 169 write_byte(w, b); 170 } 171 172 write_byte(w, '\n'); 173 flush(w); 174 } 175 176 // handle_reader skips leading UTF-8 BOMs (byte-order marks), and turns all 177 // CR-LF pairs into single LF bytes 178 bool handle_reader(bufwriter* w, FILE* r, slice* line) { 179 for (size_t i = 0; !feof(w->out); i++) { 180 ssize_t len = getline((char**)&line->ptr, &line->cap, r); 181 if (len < 0) { 182 break; 183 } 184 185 if (line->ptr == NULL) { 186 fprintf(stderr, ERROR_LINE("out of memory")); 187 return false; 188 } 189 190 // line->len = len; 191 192 // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it 193 if (i == 0 && starts_with_bom(line->ptr, len)) { 194 write_squeezed_line(w, line->ptr + 3, len - 3); 195 continue; 196 } 197 198 write_squeezed_line(w, line->ptr, len); 199 } 200 201 flush(w); 202 return true; 203 } 204 205 // handle_file handles data from the filename given; returns false only when 206 // the file can't be opened 207 bool handle_file(bufwriter* w, const char* path, slice* line) { 208 FILE* f = fopen(path, "rb"); 209 if (f == NULL) { 210 fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path); 211 return false; 212 } 213 214 const bool ok = handle_reader(w, f, line); 215 fclose(f); 216 return ok; 217 } 218 219 // run returns the number of errors 220 int run(int argc, char** argv, FILE* w) { 221 unsigned char outbuf[8 * 1024]; 222 bufwriter bw; 223 224 size_t dashes = 0; 225 for (int i = 1; i < argc; i++) { 226 if (argv[i][0] == '-' && argv[i][1] == 0) { 227 dashes++; 228 } 229 } 230 231 if (dashes > 1) { 232 const char* m = "can't use the standard input (dash) more than once"; 233 fprintf(stderr, ERROR_LINE("%s"), m); 234 return 1; 235 } 236 237 slice line; 238 line.len = 0; 239 line.cap = 32 * 1024; 240 line.ptr = malloc(line.cap); 241 242 if (line.ptr == NULL) { 243 fprintf(stderr, ERROR_LINE("out of memory")); 244 return 1; 245 } 246 247 init_bufwriter(&bw, w, outbuf, sizeof(outbuf)); 248 249 size_t errors = 0; 250 for (int i = 1; i < argc && !feof(stdout) && line.ptr != NULL; i++) { 251 if (argv[i][0] == '-' && argv[i][1] == 0) { 252 if (!handle_reader(&bw, stdin, &line)) { 253 errors++; 254 } 255 continue; 256 } 257 258 if (!handle_file(&bw, argv[i], &line)) { 259 errors++; 260 } 261 } 262 263 // use stdin when not given any filepaths 264 if (argc <= 1) { 265 if (!handle_reader(&bw, stdin, &line)) { 266 errors++; 267 } 268 } 269 270 free(line.ptr); 271 flush(&bw); 272 return errors; 273 } 274 275 int main(int argc, char** argv) { 276 #ifdef _WIN32 277 setmode(fileno(stdin), O_BINARY); 278 // ensure output lines end in LF instead of CRLF on windows 279 setmode(fileno(stdout), O_BINARY); 280 setmode(fileno(stderr), O_BINARY); 281 #endif 282 283 if (argc > 1) { 284 if ( 285 strcmp(argv[1], "-h") == 0 || 286 strcmp(argv[1], "-help") == 0 || 287 strcmp(argv[1], "--h") == 0 || 288 strcmp(argv[1], "--help") == 0 289 ) { 290 fprintf(stdout, "%s", info); 291 return 0; 292 } 293 } 294 295 return run(argc, argv, stdout) == 0 ? 0 : 1; 296 }