// File: dessv.c
/*
The MIT License (MIT)

Copyright © 2026 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O2 -march=native -mtune=native -flto -o ./dessv ./dessv.c
*/

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// ERROR_STYLE and RESET_STYLE are the ANSI escape sequences wrapped around
// error messages; both are empty unless RED_ERRORS is defined at build time.
// Apple builds get the basic 8-color red instead of the 24-bit one; exactly
// one definition of ERROR_STYLE is ever active, so nothing is redefined
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

// ERROR_LINE turns a string literal into a styled, line-feed-ended literal
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

// BAD_ALLOC is the exit code used when memory can't be allocated
#define BAD_ALLOC 2

// OBUF_SIZE is the size in bytes of the output buffer; it can be overridden
// at build time
#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

// info is the help message shown by the help options
const char* info = ""
"dessv [filenames...]\n"
"\n"
"Turn Space(s)-Separated Values (SSV) into Tab-Separated Values (TSV), where\n"
"both leading and trailing spaces from input lines are ignored.\n"
"";

// bufwriter is, as the name implies, a buffered-writer: when it's aimed at
// stdout, it considerably speeds up this app, as intended
typedef struct bufwriter {
    // buf is the buffer proper
    unsigned char* buf;

    // len is how many bytes of the buffer are currently being used
    size_t len;

    // cap is the capacity of the buffer, or the most bytes it can hold
    size_t cap;

    // out is the destination of all that's written into the buffer
    FILE* out;
} bufwriter;

// init_bufwriter sets up an empty buffered-writer, which sends its bytes to
// the stream given, using the caller-provided buffer with the capacity given;
// the buffer isn't owned by the writer, and its capacity must not be zero
void init_bufwriter(bufwriter* w, FILE* out, unsigned char* b, size_t cap) {
    w->buf = b;
    w->len = 0;
    w->cap = cap;
    w->out = out;
}

// write_byte appends a single byte, first emptying a full buffer into the
// destination stream; a failed write sets the stream's error indicator,
// which the reading loops check via ferror
void write_byte(bufwriter* w, unsigned char b) {
    if (w->len < w->cap) {
        w->buf[w->len++] = b;
        return;
    }

    fwrite(w->buf, 1, w->cap, w->out);
    w->buf[0] = b;
    w->len = 1;
}

// write_bytes does as it says, minimizing the number of calls to fwrite:
// bytes are copied in chunks as big as the buffer's free space allows, and
// a full buffer is only emptied when there are still more bytes to add
void write_bytes(bufwriter* w, const unsigned char* src, size_t len) {
    while (len > 0) {
        if (w->len == w->cap) {
            fwrite(w->buf, 1, w->cap, w->out);
            w->len = 0;
        }

        const size_t room = w->cap - w->len;
        const size_t n = (len < room) ? len : room;
        memcpy(w->buf + w->len, src, n);
        w->len += n;
        src += n;
        len -= n;
    }
}

// flush sends all buffered bytes to the destination stream, then flushes
// the stream itself
void flush(bufwriter* w) {
    if (w->len > 0) {
        fwrite(w->buf, 1, w->len, w->out);
    }
    w->len = 0;
    fflush(w->out);
}

// slice is a growable region of bytes in memory
typedef struct slice {
    // ptr is the starting place of the region
    unsigned char* ptr;

    // cap is how many bytes the memory region has available
    size_t cap;
} slice;

// starts_with_bom checks if the bytes given start with a UTF-8 byte-order
// mark, which is the 3-byte sequence EF BB BF
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// has_tabs checks if any of the bytes given is a tab
bool has_tabs(const unsigned char* b, const size_t n) {
    for (size_t i = 0; i < n; i++) {
        if (b[i] == '\t') {
            return true;
        }
    }
    return false;
}

// count_tabs returns how many of the bytes given are tabs
size_t count_tabs(const unsigned char* b, const size_t n) {
    size_t tabs = 0;
    for (size_t i = 0; i < n; i++) {
        tabs += (b[i] == '\t');
    }
    return tabs;
}

// write_tsv_line returns the number of tab-separated values emitted; current
// line isn't ended with a line-feed, which must be emitted separately; the
// width parameter is ignored, and is only there to match write_ssv_line
size_t write_tsv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
    (void)width;

    // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
    if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
        len -= 2;
    } else if (len >= 1 && p[len - 1] == '\n') {
        len--;
    }

    write_bytes(w, p, len);
    return count_tabs(p, len) + 1;
}

// write_ssv_line returns the number of tabs emitted, which is 1 less than
// the number of values emitted for non-empty lines; current line isn't ended
// with a line-feed, which must be emitted separately; width is the most tabs
// to emit: once that many are emitted, the rest of the line is copied as is,
// which makes the last value keep any runs of spaces inside it
size_t write_ssv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
    // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
    if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
        len -= 2;
    } else if (len >= 1 && p[len - 1] == '\n') {
        len--;
    }

    // ignore leading spaces
    while (len > 0 && p[0] == ' ') {
        p++;
        len--;
    }

    // ignore trailing spaces: doing it upfront also keeps them out of the
    // rest of the line, when that's copied as is after reaching the width
    while (len > 0 && p[len - 1] == ' ') {
        len--;
    }

    size_t i = 0;
    size_t tabs = 0;

    while (i < len && tabs < width) {
        if (p[i] != ' ') {
            write_byte(w, p[i]);
            i++;
            continue;
        }

        // skip the whole run of spaces; split looping condition to make
        // automatic code-checkers happy
        while (i < len) {
            if (p[i] != ' ') {
                break;
            }
            i++;
        }

        // can't happen after trimming trailing spaces, but check anyway
        if (i == len) {
            break;
        }

        // each run of spaces between values turns into a single tab
        tabs++;
        write_byte(w, '\t');
    }

    if (i < len) {
        write_bytes(w, p + i, len - i);
    }

    return tabs;
}

// handle_reader converts all lines from the input stream given: the first
// line decides whether all lines are handled as TSV or SSV, and how many
// items later lines are padded to; the line buffer is grown by getline as
// needed, and is reused across calls; live_lines flushes after each line;
// reading stops early when the output stream is in an error state; the
// app quits with the BAD_ALLOC exit code when running out of memory
void handle_reader(bufwriter* w, FILE* r, slice* line, bool live_lines) {
    size_t (*write_items)(bufwriter*, const unsigned char*, size_t, size_t) = NULL;
    size_t items = 0;

    // feof is never set on write-only streams, so ferror is what detects
    // when the output can't be written to anymore
    for (size_t i = 0; !feof(w->out) && !ferror(w->out); i++) {
        // errno tells the end of the input apart from allocation failures
        errno = 0;
        ssize_t len = getline((char**)&line->ptr, &line->cap, r);
        if (len < 0) {
            if (errno == ENOMEM) {
                fprintf(stderr, "\n");
                fprintf(stderr, ERROR_LINE("out of memory"));
                exit(BAD_ALLOC);
            }
            break;
        }

        unsigned char* ptr = line->ptr;

        // turn trailing carriage-returns into line-feeds
        if (len >= 1 && ptr[len - 1] == '\r') {
            ptr[len - 1] = '\n';
        }

        // get rid of carriage-returns preceding line-feeds
        if (len >= 2 && ptr[len - 2] == '\r' && ptr[len - 1] == '\n') {
            ptr[len - 2] = '\n';
            len--;
        }

        size_t got = 0;

        if (i == 0) {
            // 1st line: figure out if lines are already TSV, remember
            // item-count, and ignore UTF-8 byte-order marks
            if (starts_with_bom(ptr, (size_t)len)) {
                ptr += 3;
                len -= 3;
            }

            const bool tsv = has_tabs(ptr, (size_t)len);
            write_items = tsv ? write_tsv_line : write_ssv_line;
            items = write_items(w, ptr, (size_t)len, SIZE_MAX);
            got = items;
        } else {
            // write normal data lines
            got = write_items(w, ptr, (size_t)len, items);
        }

        // add empty fields, when missing trailing ones
        for (size_t j = got; j < items; j++) {
            write_byte(w, '\t');
        }

        write_byte(w, '\n');
        if (live_lines) {
            flush(w);
        }
    }
}

// handle_file handles data from the filename given; returns false only when
// the file can't be opened
bool handle_file(bufwriter* w, const char* path, slice* line, bool live_lines) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_reader(w, f, line, live_lines);
    fclose(f);
    return true;
}

// run returns the number of errors: it handles all the inputs named by the
// command-line arguments, where a single dash stands for the standard input,
// which is also used when no names are given; the standard input can't be
// used more than once
int run(int argc, char** argv, FILE* w, bool live_lines) {
    unsigned char outbuf[OBUF_SIZE];
    bufwriter bw;

    size_t dashes = 0;
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-") == 0) {
            dashes++;
        }
    }

    if (dashes > 1) {
        const char* m = "can't use the standard input (dash) more than once";
        fprintf(stderr, ERROR_LINE("%s"), m);
        return 1;
    }

    slice line;
    line.cap = 32 * 1024;
    line.ptr = malloc(line.cap);

    // the line buffer is needed no matter how the output is buffered
    if (line.ptr == NULL) {
        fprintf(stderr, ERROR_LINE("out of memory"));
        exit(BAD_ALLOC);
    }

    init_bufwriter(&bw, w, outbuf, sizeof(outbuf));

    size_t errors = 0;
    // stop handling inputs as soon as the output can't be written to
    for (int i = 1; i < argc && !feof(w) && !ferror(w); i++) {
        if (strcmp(argv[i], "-") == 0) {
            handle_reader(&bw, stdin, &line, live_lines);
            continue;
        }

        if (!handle_file(&bw, argv[i], &line, live_lines)) {
            errors++;
        }
    }

    // use stdin when not given any filepaths
    if (argc <= 1) {
        handle_reader(&bw, stdin, &line, live_lines);
    }

    free(line.ptr);
    flush(&bw);
    return errors;
}

// main shows the help message when the first argument is one of the help
// options, otherwise it converts all inputs; the exit code is 0 on success,
// and 1 when any errors happened
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    if (argc > 1) {
        if (
            strcmp(argv[1], "-h") == 0 ||
            strcmp(argv[1], "-help") == 0 ||
            strcmp(argv[1], "--h") == 0 ||
            strcmp(argv[1], "--help") == 0
        ) {
            fprintf(stdout, "%s", info);
            return 0;
        }
    }

    // lines are flushed one at a time whenever lseek gives a non-zero result
    // for stdout: that includes the -1 it returns on failure, which is what
    // happens with pipes, as those can't seek; presumably the intent is to
    // only fully-buffer output when it's saved into a file
    const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0;
    if (live_lines) {
        setvbuf(stdout, NULL, _IOLBF, 0);
    } else {
        setvbuf(stdout, NULL, _IOFBF, 0);
    }
    return run(argc, argv, stdout, live_lines) == 0 ? 0 : 1;
}