/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./squeeze ./squeeze.c
*/

// getline and fileno are POSIX functions, not ISO C ones: request their
// declarations explicitly, so strict modes such as -std=c11 can see them
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// ERROR_STYLE and RESET_STYLE are string literals wrapped around error
// messages: they're ANSI color codes when RED_ERRORS is defined, and expand
// to nothing otherwise; each macro is defined exactly once per configuration
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

// ERROR_LINE turns a string literal into a styled line, line-feed included
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

// BAD_ALLOC is the exit code used when memory can't be allocated
#define BAD_ALLOC 2

// OBUF_SIZE is the size in bytes of the output buffer; it can be overridden
// at build time
#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

// info is the help message shown by the help options
const char* info = ""
"squeeze [filenames...]\n"
"\n"
"Ignore leading/trailing spaces (and carriage-returns) on lines, also turning\n"
"all runs of multiple consecutive spaces into single spaces. Spaces around\n"
"tabs are ignored as well.\n"
"";

// bufwriter is, as the name implies, a buffered-writer: when it's aimed at
// stdout, it considerably speeds up this app, as intended
typedef struct bufwriter {
    // buf is the buffer proper
    unsigned char* buf;

    // len is how many bytes of the buffer are currently being used
    size_t len;

    // cap is the capacity of the buffer, or the most bytes it can hold
    size_t cap;

    // out is the destination of all that's written into the buffer
    FILE* out;
} bufwriter;

// init_bufwriter sets up the writer given as empty, using the caller-owned
// buffer b, which holds cap bytes, and the destination out
void init_bufwriter(bufwriter* w, FILE* out, unsigned char* b, size_t cap) {
    w->buf = b;
    w->len = 0;
    w->cap = cap;
    w->out = out;
}

// write_byte appends a byte to the buffer; when the buffer is already full,
// its contents are sent to the destination first, and the byte given then
// starts the buffer over
void write_byte(bufwriter* w, unsigned char b) {
    if (w->len < w->cap) {
        w->buf[w->len++] = b;
        return;
    }

    fwrite(w->buf, 1, w->cap, w->out);
    w->buf[0] = b;
    w->len = 1;
}

// flush sends any buffered bytes to the destination, empties the buffer, and
// flushes the destination stream as well
void flush(bufwriter* w) {
    if (w->len > 0) {
        fwrite(w->buf, 1, w->len, w->out);
    }
    w->len = 0;
    fflush(w->out);
}

// slice is a growable region of bytes in memory
typedef struct slice {
    // ptr is the starting place of the region
    unsigned char* ptr;

    // cap is how many bytes the memory region has available
    size_t cap;
} slice;

// starts_with_bom checks if the first n bytes at b begin with the 3-byte
// UTF-8 BOM (byte-order mark)
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// write_squeezed_line emits the line given (len bytes starting at p) without
// its trailing LF/CRLF, without leading/trailing spaces, without spaces next
// to tabs, and with each remaining run of spaces turned into a single space;
// the output line always ends with a single line-feed
void write_squeezed_line(bufwriter* w, const unsigned char* p, size_t len) {
    // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
    if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
        len -= 2;
    } else if (len >= 1 && p[len - 1] == '\n') {
        len--;
    }

    // ignore leading spaces
    while (len > 0 && p[0] == ' ') {
        p++;
        len--;
    }

    // spaces are only remembered when first seen: a single space is emitted
    // later, right before the next byte which is neither a space nor a tab;
    // trailing spaces are thus never emitted, since nothing follows them
    bool space = false;

    for (size_t i = 0; i < len; i++) {
        unsigned char b = p[i];

        if (b == ' ') {
            space = true;
            continue;
        }

        if (b == '\t') {
            // forget any spaces right before the tab
            space = false;
            write_byte(w, '\t');

            // ignore spaces right after tabs
            while (i + 1 < len && p[i + 1] == ' ') {
                i++;
            }

            continue;
        }

        if (space) {
            write_byte(w, ' ');
            space = false;
        }
        write_byte(w, b);
    }

    write_byte(w, '\n');
}

// handle_reader squeezes all lines from the reader given, until its input
// ends or the output stream reports an error; the line slice is the reusable
// buffer getline grows as needed; when live_lines is set, output is flushed
// after each line, otherwise only once at the end
void handle_reader(bufwriter* w, FILE* r, slice* line, bool live_lines) {
    // failed writes set the error indicator of an output stream, not its
    // end-of-file one, so ferror is what can detect a broken destination
    for (size_t i = 0; !ferror(w->out); i++) {
        ssize_t len = getline((char**)&line->ptr, &line->cap, r);
        if (len < 0) {
            break;
        }

        if (line->ptr == NULL) {
            fprintf(stderr, "\n");
            fprintf(stderr, ERROR_LINE("out of memory"));
            exit(BAD_ALLOC);
        }

        const unsigned char* start = line->ptr;
        size_t n = (size_t)len;

        // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
        if (i == 0 && starts_with_bom(start, n)) {
            start += 3;
            n -= 3;
        }

        write_squeezed_line(w, start, n);

        // every line is flushed in live mode, including a first line which
        // started with a BOM
        if (live_lines) {
            flush(w);
        }
    }

    if (!live_lines) {
        flush(w);
    }
}

// handle_file handles data from the filename given; returns false only when
// the file can't be opened
bool handle_file(bufwriter* w, const char* path, slice* line, bool live_lines) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_reader(w, f, line, live_lines);
    fclose(f);
    return true;
}

// run squeezes all inputs named in the arguments, a single dash standing for
// the standard input, which is also used when no names are given; returns
// the number of errors
int run(int argc, char** argv, FILE* w, bool live_lines) {
    unsigned char outbuf[OBUF_SIZE];
    bufwriter bw;

    size_t dashes = 0;
    for (int i = 1; i < argc; i++) {
        if (argv[i][0] == '-' && argv[i][1] == 0) {
            dashes++;
        }
    }

    if (dashes > 1) {
        const char* m = "can't use the standard input (dash) more than once";
        fprintf(stderr, ERROR_LINE("%s"), m);
        return 1;
    }

    // the line buffer is shared across all inputs, and is grown by getline
    slice line;
    line.cap = 32 * 1024;
    line.ptr = malloc(line.cap);

    if (line.ptr == NULL) {
        fprintf(stderr, ERROR_LINE("out of memory"));
        exit(BAD_ALLOC);
    }

    init_bufwriter(&bw, w, outbuf, sizeof(outbuf));

    // stop going through the inputs as soon as the output stream fails
    int errors = 0;
    for (int i = 1; i < argc && !ferror(w); i++) {
        if (argv[i][0] == '-' && argv[i][1] == 0) {
            handle_reader(&bw, stdin, &line, live_lines);
            continue;
        }

        if (!handle_file(&bw, argv[i], &line, live_lines)) {
            errors++;
        }
    }

    // use stdin when not given any filepaths
    if (argc <= 1) {
        handle_reader(&bw, stdin, &line, live_lines);
    }

    free(line.ptr);
    flush(&bw);
    return errors;
}

// main shows the help message when the first argument asks for it; otherwise
// it squeezes all inputs, exiting with 0 on success and 1 on any error
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    if (argc > 1) {
        if (
            strcmp(argv[1], "-h") == 0 ||
            strcmp(argv[1], "-help") == 0 ||
            strcmp(argv[1], "--h") == 0 ||
            strcmp(argv[1], "--help") == 0
        ) {
            fprintf(stdout, "%s", info);
            return 0;
        }
    }

    // lines are flushed one at a time unless lseek reports stdout is at
    // offset 0; lseek fails with a non-zero result on non-seekable outputs
    const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0;
    if (!live_lines) {
        // a zero size isn't valid for all C runtimes, so ask for BUFSIZ
        setvbuf(stdout, NULL, _IOFBF, BUFSIZ);
    }
    return run(argc, argv, stdout, live_lines) == 0 ? 0 : 1;
}