File: dessv.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2026 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O2 -march=native -mtune=native -flto -o ./dessv ./dessv.c
  29 */
  30 
  31 #include <stdbool.h>
  32 #include <stdint.h>
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #include <unistd.h>
  37 
  38 #ifdef _WIN32
  39 #include <fcntl.h>
  40 #include <windows.h>
  41 #endif
  42 
  43 #ifdef RED_ERRORS
  44 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  45 #ifdef __APPLE__
  46 #define ERROR_STYLE "\x1b[31m"
  47 #endif
  48 #define RESET_STYLE "\x1b[0m"
  49 #else
  50 #define ERROR_STYLE
  51 #define RESET_STYLE
  52 #endif
  53 
  54 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  55 
  56 #define BAD_ALLOC 2
  57 
  58 #ifndef OBUF_SIZE
  59 #define OBUF_SIZE (8 * 1024)
  60 #endif
  61 
  62 const char* info = ""
  63 "dessv [filenames...]\n"
  64 "\n"
  65 "Turn Space(s)-Separated Values (SSV) into Tab-Separated Values (TSV), where\n"
  66 "both leading and trailing spaces from input lines are ignored.\n"
  67 "";
  68 
  69 // bufwriter is, as the name implies, a buffered-writer: when it's aimed at
  70 // stdout, it considerably speeds up this app, as intended
  71 typedef struct bufwriter {
  72     // buf is the buffer proper
  73     unsigned char* buf;
  74 
  75     // len is how many bytes of the buffer are currently being used
  76     size_t len;
  77 
  78     // cap is the capacity of the buffer, or the most bytes it can hold
  79     size_t cap;
  80 
  81     // out is the destination of all that's written into the buffer
  82     FILE* out;
  83 } bufwriter;
  84 
  85 void init_bufwriter(bufwriter* w, FILE* out, unsigned char* b, size_t cap) {
  86     w->buf = b;
  87     w->len = 0;
  88     w->cap = cap;
  89     w->out = out;
  90 }
  91 
  92 void write_byte(bufwriter* w, unsigned char b) {
  93     if (w->len < w->cap) {
  94         w->buf[w->len++] = b;
  95         return;
  96     }
  97 
  98     fwrite(w->buf, 1, w->cap, w->out);
  99     w->buf[0] = b;
 100     w->len = 1;
 101 }
 102 
 103 // write_bytes does as it says, minimizing the number of calls to fwrite
 104 void write_bytes(bufwriter* w, const unsigned char* src, size_t len) {
 105     const size_t rem = w->cap - w->len;
 106     if (len < rem) {
 107         memcpy(w->buf + w->len, src, len);
 108         w->len += len;
 109         return;
 110     }
 111 
 112     for (size_t i = 0; i < len; i++) {
 113         write_byte(w, src[i]);
 114     }
 115 }
 116 
 117 void flush(bufwriter* w) {
 118     if (w->len > 0) {
 119         fwrite(w->buf, 1, w->len, w->out);
 120     }
 121     w->len = 0;
 122     fflush(w->out);
 123 }
 124 
 125 // slice is a growable region of bytes in memory
 126 typedef struct slice {
 127     // ptr is the starting place of the region
 128     unsigned char* ptr;
 129 
 130     // cap is how many bytes the memory region has available
 131     size_t cap;
 132 } slice;
 133 
 134 bool starts_with_bom(const unsigned char* b, const size_t n) {
 135     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 136 }
 137 
 138 bool has_tabs(const unsigned char* b, const size_t n) {
 139     for (size_t i = 0; i < n; i++) {
 140         if (b[i] == '\t') {
 141             return true;
 142         }
 143     }
 144     return false;
 145 }
 146 
 147 size_t count_tabs(const unsigned char* b, const size_t n) {
 148     size_t tabs = 0;
 149     for (size_t i = 0; i < n; i++) {
 150         tabs += (b[i] == '\t');
 151     }
 152     return tabs;
 153 }
 154 
 155 // write_tsv_line returns the number of tab-separated values emitted; current
 156 // line isn't ended with a line-feed, which must be emitted separately
 157 size_t write_tsv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
 158     // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
 159     if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
 160         len -= 2;
 161     } else if (len >= 1 && p[len - 1] == '\n') {
 162         len--;
 163     }
 164 
 165     write_bytes(w, p, len);
 166     return count_tabs(p, len) + 1;
 167 }
 168 
 169 // write_ssv_line returns the number of tab-separated values emitted; current
 170 // line isn't ended with a line-feed, which must be emitted separately
 171 size_t write_ssv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
 172     // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
 173     if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
 174         len -= 2;
 175     } else if (len >= 1 && p[len - 1] == '\n') {
 176         len--;
 177     }
 178 
 179     // ignore leading spaces
 180     while (len > 0 && p[0] == ' ') {
 181         p++;
 182         len--;
 183     }
 184 
 185     // trailing spaces are inconsequential, since there's nothing to follow
 186     // them, which in turn prevents their normal single-tab substitutes from
 187     // being emitted
 188 
 189     size_t i = 0;
 190     size_t items = 0;
 191 
 192     for (; i < len && items < width; i++) {
 193         unsigned char b = p[i];
 194 
 195         if (b == ' ') {
 196             i++;
 197             // split looping condition to make automatic code-checkers happy
 198             while (i < len) {
 199                 if (p[i] != ' ') {
 200                     break;
 201                 }
 202                 i++;
 203             }
 204             if (i == len) {
 205                 continue;
 206             }
 207 
 208             items++;
 209             write_byte(w, '\t');
 210 
 211             b = p[i];
 212         }
 213 
 214         write_byte(w, b);
 215     }
 216 
 217     if (i < len) {
 218         write_bytes(w, p + i, len - i);
 219     }
 220 
 221     return items;
 222 }
 223 
 224 void handle_reader(bufwriter* w, FILE* r, slice* line, bool live_lines) {
 225     size_t (*write_items)(bufwriter*, const unsigned char*, size_t, size_t) = NULL;
 226     size_t items = 0;
 227 
 228     for (size_t i = 0; !feof(w->out); i++) {
 229         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 230         if (len < 0) {
 231             break;
 232         }
 233 
 234         if (line->ptr == NULL) {
 235             fprintf(stderr, "\n");
 236             fprintf(stderr, ERROR_LINE("out of memory"));
 237             exit(BAD_ALLOC);
 238         }
 239 
 240         unsigned char* ptr = line->ptr;
 241 
 242         // turn trailing carriage-returns into line-feeds
 243         if (len >= 1 && ptr[len - 1] == '\r') {
 244             ptr[len - 1] = '\n';
 245         }
 246 
 247         // get rid of carriage-returns preceding line-feeds
 248         if (len >= 2 && ptr[len - 2] == '\r' && ptr[len - 1] == '\n') {
 249             ptr[len - 2] = '\n';
 250             len--;
 251         }
 252 
 253         // 1st line: figure out if lines are already TSV, remember item-count,
 254         // and ignore UTF-8 byte-order marks
 255         if (i == 0) {
 256             if (starts_with_bom(ptr, len)) {
 257                 ptr += 3;
 258                 len -= 3;
 259             }
 260 
 261             const bool tsv = has_tabs(ptr, len);
 262             write_items = tsv ? write_tsv_line : write_ssv_line;
 263             items = write_items(w, ptr, len, SIZE_MAX);
 264             write_byte(w, '\n');
 265             if (live_lines) {
 266                 flush(w);
 267             }
 268             continue;
 269         }
 270 
 271         // write normal data lines
 272         size_t got = write_items(w, ptr, len, items);
 273         // add empty fields, when missing trailing ones
 274         for (size_t j = got; j < items; j++) {
 275             write_byte(w, '\t');
 276         }
 277 
 278         write_byte(w, '\n');
 279         if (live_lines) {
 280             flush(w);
 281         }
 282     }
 283 }
 284 
 285 // handle_file handles data from the filename given; returns false only when
 286 // the file can't be opened
 287 bool handle_file(bufwriter* w, const char* path, slice* line, bool live_lines) {
 288     FILE* f = fopen(path, "rb");
 289     if (f == NULL) {
 290         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 291         return false;
 292     }
 293 
 294     handle_reader(w, f, line, live_lines);
 295     fclose(f);
 296     return true;
 297 }
 298 
 299 // run returns the number of errors
 300 int run(int argc, char** argv, FILE* w, bool live_lines) {
 301     unsigned char outbuf[OBUF_SIZE];
 302     bufwriter bw;
 303 
 304     size_t dashes = 0;
 305     for (int i = 1; i < argc; i++) {
 306         if (strcmp(argv[i], "-") == 0) {
 307             dashes++;
 308         }
 309     }
 310 
 311     if (dashes > 1) {
 312         const char* m = "can't use the standard input (dash) more than once";
 313         fprintf(stderr, ERROR_LINE("%s"), m);
 314         return 1;
 315     }
 316 
 317     slice line;
 318     line.cap = 32 * 1024;
 319     line.ptr = malloc(line.cap);
 320 
 321     if (live_lines) {
 322         if (line.ptr == NULL) {
 323             fprintf(stderr, ERROR_LINE("out of memory"));
 324             exit(BAD_ALLOC);
 325         }
 326     }
 327 
 328     init_bufwriter(&bw, w, outbuf, sizeof(outbuf));
 329 
 330     size_t errors = 0;
 331     for (int i = 1; i < argc && !feof(w); i++) {
 332         if (strcmp(argv[i], "-") == 0) {
 333             handle_reader(&bw, stdin, &line, live_lines);
 334             continue;
 335         }
 336 
 337         if (!handle_file(&bw, argv[i], &line, live_lines)) {
 338             errors++;
 339         }
 340     }
 341 
 342     // use stdin when not given any filepaths
 343     if (argc <= 1) {
 344         handle_reader(&bw, stdin, &line, live_lines);
 345     }
 346 
 347     free(line.ptr);
 348     flush(&bw);
 349     return errors;
 350 }
 351 
 352 int main(int argc, char** argv) {
 353 #ifdef _WIN32
 354     setmode(fileno(stdin), O_BINARY);
 355     // ensure output lines end in LF instead of CRLF on windows
 356     setmode(fileno(stdout), O_BINARY);
 357     setmode(fileno(stderr), O_BINARY);
 358 #endif
 359 
 360     if (argc > 1) {
 361         if (
 362             strcmp(argv[1], "-h") == 0 ||
 363             strcmp(argv[1], "-help") == 0 ||
 364             strcmp(argv[1], "--h") == 0 ||
 365             strcmp(argv[1], "--help") == 0
 366         ) {
 367             fprintf(stdout, "%s", info);
 368             return 0;
 369         }
 370     }
 371 
 372     const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0;
 373     if (live_lines) {
 374         setvbuf(stdout, NULL, _IOLBF, 0);
 375     } else {
 376         setvbuf(stdout, NULL, _IOFBF, 0);
 377     }
 378     return run(argc, argv, stdout, live_lines) == 0 ? 0 : 1;
 379 }