/*
The MIT License (MIT)

Copyright © 2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O2 -march=native -mtune=native -flto -o ./dessv ./dessv.c
*/

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// ERROR_STYLE and RESET_STYLE wrap error messages in ANSI color codes, but
// only when RED_ERRORS is defined at build time; each macro is defined exactly
// once per configuration, to avoid incompatible macro redefinitions
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

// ERROR_LINE turns a string literal into a (possibly styled) full line
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

// BAD_ALLOC is the exit code used when running out of memory
#define BAD_ALLOC 2

// OBUF_SIZE is the capacity of the output buffer, and can be overridden at
// build time
#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

const char* info = ""
"dessv [filenames...]\n"
"\n"
"Turn Space(s)-Separated Values (SSV) into Tab-Separated Values (TSV), where\n"
"both leading and trailing spaces from input lines are ignored.\n"
"";

// bufwriter is, as the name implies, a buffered-writer: when it's aimed at
// stdout, it considerably speeds up this app, as intended
typedef struct bufwriter {
    // buf is the buffer proper
    unsigned char* buf;

    // len is how many bytes of the buffer are currently being used
    size_t len;

    // cap is the capacity of the buffer, or the most bytes it can hold
    size_t cap;

    // out is the destination of all that's written into the buffer
    FILE* out;
} bufwriter;

// init_bufwriter aims the writer given at the output given, using the `cap`
// bytes starting at `b` as its (initially empty) buffer; the buffer isn't
// copied, so it must outlive the writer, and `cap` must be at least 1, since
// write_byte always stores into the buffer's first slot after emptying it
void init_bufwriter(bufwriter* w, FILE* out, unsigned char* b, size_t cap) {
    w->buf = b;
    w->len = 0;
    w->cap = cap;
    w->out = out;
}

// write_byte adds a single byte to the buffer, first emptying a full buffer
// into the output; write failures aren't reported here, but stay detectable
// by calling ferror on the output
void write_byte(bufwriter* w, unsigned char b) {
    if (w->len < w->cap) {
        w->buf[w->len++] = b;
        return;
    }

    fwrite(w->buf, 1, w->cap, w->out);
    w->buf[0] = b;
    w->len = 1;
}

// write_bytes does as it says, minimizing the number of calls to fwrite:
// chunks which fit are just copied into the buffer, while chunks which don't
// fit cause at most 2 calls to fwrite, instead of going through write_byte
// one byte at a time
void write_bytes(bufwriter* w, const unsigned char* src, size_t len) {
    const size_t rem = w->cap - w->len;
    if (len <= rem) {
        memcpy(w->buf + w->len, src, len);
        w->len += len;
        return;
    }

    // empty the buffer first, so bytes reach the output in the right order
    if (w->len > 0) {
        fwrite(w->buf, 1, w->len, w->out);
        w->len = 0;
    }

    // chunks at least as big as the whole buffer gain nothing from buffering
    if (len >= w->cap) {
        fwrite(src, 1, len, w->out);
        return;
    }

    memcpy(w->buf, src, len);
    w->len = len;
}

// flush empties the buffer into the output, and flushes the output as well
void flush(bufwriter* w) {
    if (w->len > 0) {
        fwrite(w->buf, 1, w->len, w->out);
    }
    w->len = 0;
    fflush(w->out);
}

// slice is a growable region of bytes in memory
typedef struct slice {
    // ptr is the starting place of the region
    unsigned char* ptr;

    // cap is how many bytes the memory region has available
    size_t cap;
} slice;

// starts_with_bom checks if the first `n` bytes at `b` start with the 3-byte
// UTF-8 byte-order mark
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// has_tabs checks if any of the first `n` bytes at `b` is a tab
bool has_tabs(const unsigned char* b, const size_t n) {
    for (size_t i = 0; i < n; i++) {
        if (b[i] == '\t') {
            return true;
        }
    }
    return false;
}

// count_tabs counts how many of the first `n` bytes at `b` are tabs
size_t count_tabs(const unsigned char* b, const size_t n) {
    size_t tabs = 0;
    for (size_t i = 0; i < n; i++) {
        tabs += (b[i] == '\t');
    }
    return tabs;
}

// write_tsv_line copies a line which is already tab-separated, and returns
// the number of tab-separated values emitted, which is the tab-count plus 1;
// current line isn't ended with a line-feed, which must be emitted separately;
// `width` is ignored, and is only there to share the same function signature
// as write_ssv_line
size_t write_tsv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
    (void)width;

    // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
    if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
        len -= 2;
    } else if (len >= 1 && p[len - 1] == '\n') {
        len--;
    }

    write_bytes(w, p, len);
    return count_tabs(p, len) + 1;
}

// write_ssv_line turns runs of spaces between values into single tabs, and
// returns the number of tabs emitted, which is 1 less than the number of
// values for non-empty lines; at most `width` tabs are emitted, so anything
// after the last one allowed is copied as is, inner spaces included; current
// line isn't ended with a line-feed, which must be emitted separately
size_t write_ssv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
    // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
    if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
        len -= 2;
    } else if (len >= 1 && p[len - 1] == '\n') {
        len--;
    }

    // ignore leading spaces
    while (len > 0 && p[0] == ' ') {
        p++;
        len--;
    }

    // ignore trailing spaces: doing it upfront also keeps them out of the
    // leftovers copied as is, when lines have more values than allowed
    while (len > 0 && p[len - 1] == ' ') {
        len--;
    }

    size_t i = 0;
    size_t tabs = 0;

    while (i < len && tabs < width) {
        if (p[i] != ' ') {
            write_byte(w, p[i]);
            i++;
            continue;
        }

        // a whole run of spaces becomes a single tab; a value always follows,
        // since trailing spaces are already excluded
        while (i < len && p[i] == ' ') {
            i++;
        }
        write_byte(w, '\t');
        tabs++;
    }

    // copy leftovers as is, when the tab limit was reached mid-line
    if (i < len) {
        write_bytes(w, p + i, len - i);
    }

    return tabs;
}

// handle_reader converts all lines from the input given: the first line
// decides whether lines are copied as TSV (it has tabs) or converted from SSV
// (it doesn't), and its item-count is what all later lines are limited/padded
// to; reading stops early if the output is in an error state; `line` is the
// reusable/growable line buffer, and `live_lines` causes a flush after each
// line; running out of memory quits the app with exit code BAD_ALLOC
void handle_reader(bufwriter* w, FILE* r, slice* line, bool live_lines) {
    size_t (*write_items)(bufwriter*, const unsigned char*, size_t, size_t) = NULL;
    size_t items = 0;

    // ferror is what detects failed writes: feof only ever applies to inputs
    for (size_t i = 0; !ferror(w->out); i++) {
        // go through a temporary, instead of casting the pointer-to-pointer
        char* buf = (char*)line->ptr;
        errno = 0;
        const ssize_t got_bytes = getline(&buf, &line->cap, r);
        line->ptr = (unsigned char*)buf;

        if (got_bytes < 0) {
            // getline fails both when input is over and when out of memory
            if (errno == ENOMEM) {
                fprintf(stderr, "\n");
                fprintf(stderr, ERROR_LINE("out of memory"));
                exit(BAD_ALLOC);
            }
            break;
        }

        unsigned char* ptr = line->ptr;
        size_t len = (size_t)got_bytes;

        // turn trailing carriage-returns into line-feeds
        if (len >= 1 && ptr[len - 1] == '\r') {
            ptr[len - 1] = '\n';
        }

        // get rid of carriage-returns preceding line-feeds
        if (len >= 2 && ptr[len - 2] == '\r' && ptr[len - 1] == '\n') {
            ptr[len - 2] = '\n';
            len--;
        }

        // 1st line: figure out if lines are already TSV, remember item-count,
        // and ignore UTF-8 byte-order marks
        const bool first = (i == 0);
        if (first) {
            if (starts_with_bom(ptr, len)) {
                ptr += 3;
                len -= 3;
            }
            write_items = has_tabs(ptr, len) ? write_tsv_line : write_ssv_line;
        }

        const size_t got = write_items(w, ptr, len, first ? SIZE_MAX : items);
        if (first) {
            items = got;
        }

        // add empty fields, when missing trailing ones
        for (size_t j = got; j < items; j++) {
            write_byte(w, '\t');
        }

        write_byte(w, '\n');
        if (live_lines) {
            flush(w);
        }
    }
}

// handle_file handles data from the filename given; returns false only when
// the file can't be opened
bool handle_file(bufwriter* w, const char* path, slice* line, bool live_lines) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_reader(w, f, line, live_lines);
    fclose(f);
    return true;
}

// run handles all inputs named in the arguments, where a dash stands for the
// standard input, which is also what's used when no names are given; returns
// the number of errors
int run(int argc, char** argv, FILE* w, bool live_lines) {
    unsigned char outbuf[OBUF_SIZE];
    bufwriter bw;

    size_t dashes = 0;
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-") == 0) {
            dashes++;
        }
    }

    if (dashes > 1) {
        const char* m = "can't use the standard input (dash) more than once";
        fprintf(stderr, ERROR_LINE("%s"), m);
        return 1;
    }

    slice line;
    line.cap = 32 * 1024;
    line.ptr = malloc(line.cap);

    // check the allocation no matter what the output mode is
    if (line.ptr == NULL) {
        fprintf(stderr, ERROR_LINE("out of memory"));
        exit(BAD_ALLOC);
    }

    init_bufwriter(&bw, w, outbuf, sizeof(outbuf));

    int errors = 0;
    // stop early when the output is in an error state
    for (int i = 1; i < argc && !ferror(w); i++) {
        if (strcmp(argv[i], "-") == 0) {
            handle_reader(&bw, stdin, &line, live_lines);
            continue;
        }

        if (!handle_file(&bw, argv[i], &line, live_lines)) {
            errors++;
        }
    }

    // use stdin when not given any filepaths
    if (argc <= 1) {
        handle_reader(&bw, stdin, &line, live_lines);
    }

    free(line.ptr);
    flush(&bw);
    return errors;
}

// defining DESSV_NO_MAIN at build time leaves main out, which allows using
// the functions above from other programs, such as test harnesses
#ifndef DESSV_NO_MAIN
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    if (argc > 1) {
        if (
            strcmp(argv[1], "-h") == 0 ||
            strcmp(argv[1], "-help") == 0 ||
            strcmp(argv[1], "--h") == 0 ||
            strcmp(argv[1], "--help") == 0
        ) {
            fprintf(stdout, "%s", info);
            return 0;
        }
    }

    // lines are flushed as they come unless lseek reports position 0 for
    // stdout: any other result counts, including the -1 lseek gives on failure
    const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0;
    if (live_lines) {
        setvbuf(stdout, NULL, _IOLBF, 0);
    } else {
        setvbuf(stdout, NULL, _IOFBF, 0);
    }
    return run(argc, argv, stdout, live_lines) == 0 ? 0 : 1;
}
#endif