File: dessv.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O2 -march=native -mtune=native -flto -o ./dessv ./dessv.c
  29 */
  30 
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
  37 
  38 #ifdef _WIN32
  39 #include <fcntl.h>
  40 #include <windows.h>
  41 #endif
  42 
  43 #ifdef RED_ERRORS
  44 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  45 #ifdef __APPLE__
  46 #define ERROR_STYLE "\x1b[31m"
  47 #endif
  48 #define RESET_STYLE "\x1b[0m"
  49 #else
  50 #define ERROR_STYLE
  51 #define RESET_STYLE
  52 #endif
  53 
  54 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  55 
  56 #define BAD_ALLOC 2
  57 
  58 #ifndef OBUF_SIZE
  59 #define OBUF_SIZE (8 * 1024)
  60 #endif
  61 
  62 const char* info = ""
  63 "dessv [filenames...]\n"
  64 "\n"
  65 "Turn Space(s)-Separated Values (SSV) into Tab-Separated Values (TSV), where\n"
  66 "both leading and trailing spaces from input lines are ignored.\n"
  67 "";
  68 
  69 // bufwriter is, as the name implies, a buffered-writer: when it's aimed at
  70 // stdout, it considerably speeds up this app, as intended
  71 typedef struct bufwriter {
  72     // buf is the buffer proper
  73     unsigned char* buf;
  74 
  75     // len is how many bytes of the buffer are currently being used
  76     size_t len;
  77 
  78     // cap is the capacity of the buffer, or the most bytes it can hold
  79     size_t cap;
  80 
  81     // out is the destination of all that's written into the buffer
  82     FILE* out;
  83 } bufwriter;
  84 
  85 void init_bufwriter(bufwriter* w, FILE* out, unsigned char* b, size_t cap) {
  86     w->buf = b;
  87     w->len = 0;
  88     w->cap = cap;
  89     w->out = out;
  90 }
  91 
  92 void write_byte(bufwriter* w, unsigned char b) {
  93     if (w->len < w->cap) {
  94         w->buf[w->len++] = b;
  95         return;
  96     }
  97 
  98     fwrite(w->buf, 1, w->cap, w->out);
  99     w->buf[0] = b;
 100     w->len = 1;
 101 }
 102 
 103 // write_bytes does as it says, minimizing the number of calls to fwrite
 104 void write_bytes(bufwriter* w, const unsigned char* src, size_t len) {
 105     const size_t rem = w->cap - w->len;
 106     if (len < rem) {
 107         memcpy(w->buf + w->len, src, len);
 108         w->len += len;
 109         return;
 110     }
 111 
 112     for (size_t i = 0; i < len; i++) {
 113         write_byte(w, src[i]);
 114     }
 115 }
 116 
 117 void flush(bufwriter* w) {
 118     if (w->len > 0) {
 119         fwrite(w->buf, 1, w->len, w->out);
 120     }
 121     w->len = 0;
 122     fflush(w->out);
 123 }
 124 
 125 // slice is a growable region of bytes in memory
 126 typedef struct slice {
 127     // ptr is the starting place of the region
 128     unsigned char* ptr;
 129 
 130     // cap is how many bytes the memory region has available
 131     size_t cap;
 132 } slice;
 133 
 134 bool starts_with_bom(const unsigned char* b, const size_t n) {
 135     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 136 }
 137 
 138 bool has_tabs(const unsigned char* b, const size_t n) {
 139     for (size_t i = 0; i < n; i++) {
 140         if (b[i] == '\t') {
 141             return true;
 142         }
 143     }
 144     return false;
 145 }
 146 
 147 size_t count_tabs(const unsigned char* b, const size_t n) {
 148     size_t tabs = 0;
 149     for (size_t i = 0; i < n; i++) {
 150         tabs += (b[i] == '\t');
 151     }
 152     return tabs;
 153 }
 154 
 155 // write_tsv_line returns the number of tab-separated values emitted; current
 156 // line isn't ended with a line-feed, which must be emitted separately
 157 size_t write_tsv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
 158     // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
 159     if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
 160         len -= 2;
 161     } else if (len >= 1 && p[len - 1] == '\n') {
 162         len--;
 163     }
 164 
 165     write_bytes(w, p, len);
 166     return count_tabs(p, len) + 1;
 167 }
 168 
 169 // write_ssv_line returns the number of tab-separated values emitted; current
 170 // line isn't ended with a line-feed, which must be emitted separately
 171 size_t write_ssv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
 172     // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
 173     if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
 174         len -= 2;
 175     } else if (len >= 1 && p[len - 1] == '\n') {
 176         len--;
 177     }
 178 
 179     // ignore leading spaces
 180     while (len > 0 && p[0] == ' ') {
 181         p++;
 182         len--;
 183     }
 184 
 185     // trailing spaces are inconsequential, since there's nothing to follow
 186     // them, which in turn prevents their normal single-tab substitutes from
 187     // being emitted
 188 
 189     size_t i = 0;
 190     size_t items = 0;
 191 
 192     for (; i < len && items < width; i++) {
 193         unsigned char b = p[i];
 194 
 195         if (b == ' ') {
 196             while (i < len && p[i] == ' ') {
 197                 i++;
 198             }
 199             if (i == len) {
 200                 continue;
 201             }
 202 
 203             items++;
 204             write_byte(w, '\t');
 205 
 206             b = p[i];
 207         }
 208 
 209         write_byte(w, b);
 210     }
 211 
 212     if (i < len) {
 213         write_bytes(w, p + i, len - i);
 214     }
 215 
 216     return items;
 217 }
 218 
 219 void handle_reader(bufwriter* w, FILE* r, slice* line, bool live_lines) {
 220     size_t (*write_items)(bufwriter*, const unsigned char*, size_t, size_t) = NULL;
 221     size_t items = 0;
 222 
 223     for (size_t i = 0; !feof(w->out); i++) {
 224         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 225         if (len < 0) {
 226             break;
 227         }
 228 
 229         if (line->ptr == NULL) {
 230             fprintf(stderr, "\n");
 231             fprintf(stderr, ERROR_LINE("out of memory"));
 232             exit(BAD_ALLOC);
 233         }
 234 
 235         unsigned char* ptr = line->ptr;
 236 
 237         // turn trailing carriage-returns into line-feeds
 238         if (len >= 1 && ptr[len - 1] == '\r') {
 239             ptr[len - 1] = '\n';
 240         }
 241 
 242         // get rid of carriage-returns preceding line-feeds
 243         if (len >= 2 && ptr[len - 2] == '\r' && ptr[len - 1] == '\n') {
 244             ptr[len - 2] = '\n';
 245             len--;
 246         }
 247 
 248         // 1st line: figure out if lines are already TSV, remember item-count,
 249         // and ignore UTF-8 byte-order marks
 250         if (i == 0) {
 251             if (starts_with_bom(ptr, len)) {
 252                 ptr += 3;
 253                 len -= 3;
 254             }
 255 
 256             const bool tsv = has_tabs(ptr, len);
 257             write_items = tsv ? write_tsv_line : write_ssv_line;
 258             items = write_items(w, ptr, len, SIZE_MAX);
 259             write_byte(w, '\n');
 260             if (live_lines) {
 261                 flush(w);
 262             }
 263             continue;
 264         }
 265 
 266         // write normal data lines
 267         size_t got = write_items(w, ptr, len, items);
 268         // add empty fields, when missing trailing ones
 269         for (size_t j = got; j < items; j++) {
 270             write_byte(w, '\t');
 271         }
 272 
 273         write_byte(w, '\n');
 274         if (live_lines) {
 275             flush(w);
 276         }
 277     }
 278 }
 279 
 280 // handle_file handles data from the filename given; returns false only when
 281 // the file can't be opened
 282 bool handle_file(bufwriter* w, const char* path, slice* line, bool live_lines) {
 283     FILE* f = fopen(path, "rb");
 284     if (f == NULL) {
 285         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 286         return false;
 287     }
 288 
 289     handle_reader(w, f, line, live_lines);
 290     fclose(f);
 291     return true;
 292 }
 293 
 294 // run returns the number of errors
 295 int run(int argc, char** argv, FILE* w, bool live_lines) {
 296     unsigned char outbuf[OBUF_SIZE];
 297     bufwriter bw;
 298 
 299     size_t dashes = 0;
 300     for (int i = 1; i < argc; i++) {
 301         if (strcmp(argv[i], "-") == 0) {
 302             dashes++;
 303         }
 304     }
 305 
 306     if (dashes > 1) {
 307         const char* m = "can't use the standard input (dash) more than once";
 308         fprintf(stderr, ERROR_LINE("%s"), m);
 309         return 1;
 310     }
 311 
 312     slice line;
 313     line.cap = 32 * 1024;
 314     line.ptr = malloc(line.cap);
 315 
 316     if (live_lines) {
 317         if (line.ptr == NULL) {
 318             fprintf(stderr, ERROR_LINE("out of memory"));
 319             exit(BAD_ALLOC);
 320         }
 321     }
 322 
 323     init_bufwriter(&bw, w, outbuf, sizeof(outbuf));
 324 
 325     size_t errors = 0;
 326     for (int i = 1; i < argc && !feof(w); i++) {
 327         if (strcmp(argv[i], "-") == 0) {
 328             handle_reader(&bw, stdin, &line, live_lines);
 329             continue;
 330         }
 331 
 332         if (!handle_file(&bw, argv[i], &line, live_lines)) {
 333             errors++;
 334         }
 335     }
 336 
 337     // use stdin when not given any filepaths
 338     if (argc <= 1) {
 339         handle_reader(&bw, stdin, &line, live_lines);
 340     }
 341 
 342     free(line.ptr);
 343     flush(&bw);
 344     return errors;
 345 }
 346 
 347 int main(int argc, char** argv) {
 348 #ifdef _WIN32
 349     setmode(fileno(stdin), O_BINARY);
 350     // ensure output lines end in LF instead of CRLF on windows
 351     setmode(fileno(stdout), O_BINARY);
 352     setmode(fileno(stderr), O_BINARY);
 353 #endif
 354 
 355     if (argc > 1) {
 356         if (
 357             strcmp(argv[1], "-h") == 0 ||
 358             strcmp(argv[1], "-help") == 0 ||
 359             strcmp(argv[1], "--h") == 0 ||
 360             strcmp(argv[1], "--help") == 0
 361         ) {
 362             fprintf(stdout, "%s", info);
 363             return 0;
 364         }
 365     }
 366 
 367     const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0;
 368     if (live_lines) {
 369         setvbuf(stdout, NULL, _IOLBF, 0);
 370     } else {
 371         setvbuf(stdout, NULL, _IOFBF, 0);
 372     }
 373     return run(argc, argv, stdout, live_lines) == 0 ? 0 : 1;
 374 }