File: dessv.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./dessv ./dessv.c
  29 */
  30 
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
  37 
  38 #ifdef _WIN32
  39 #include <fcntl.h>
  40 #include <windows.h>
  41 #endif
  42 
  43 #ifdef RED_ERRORS
  44 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  45 #ifdef __APPLE__
  46 #define ERROR_STYLE "\x1b[31m"
  47 #endif
  48 #define RESET_STYLE "\x1b[0m"
  49 #else
  50 #define ERROR_STYLE
  51 #define RESET_STYLE
  52 #endif
  53 
  54 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  55 
  56 #define BAD_ALLOC 2
  57 
  58 #ifndef OBUF_SIZE
  59 #define OBUF_SIZE (8 * 1024)
  60 #endif
  61 
  62 const char* info = ""
  63 "dessv [filenames...]\n"
  64 "\n"
  65 "Turn Space(s)-Separated Values (SSV) into Tab-Separated Values (TSV), where\n"
  66 "both leading and trailing spaces from input lines are ignored.\n"
  67 "";
  68 
  69 // bufwriter is, as the name implies, a buffered-writer: when it's aimed at
  70 // stdout, it considerably speeds up this app, as intended
  71 typedef struct bufwriter {
  72     // buf is the buffer proper
  73     unsigned char* buf;
  74 
  75     // len is how many bytes of the buffer are currently being used
  76     size_t len;
  77 
  78     // cap is the capacity of the buffer, or the most bytes it can hold
  79     size_t cap;
  80 
  81     // out is the destination of all that's written into the buffer
  82     FILE* out;
  83 } bufwriter;
  84 
  85 void init_bufwriter(bufwriter* w, FILE* out, unsigned char* b, size_t cap) {
  86     w->buf = b;
  87     w->len = 0;
  88     w->cap = cap;
  89     w->out = out;
  90 }
  91 
  92 void write_byte(bufwriter* w, unsigned char b) {
  93     if (w->len < w->cap) {
  94         w->buf[w->len++] = b;
  95         return;
  96     }
  97 
  98     fwrite(w->buf, 1, w->cap, w->out);
  99     w->buf[0] = b;
 100     w->len = 1;
 101 }
 102 
 103 // write_bytes does as it says, minimizing the number of calls to fwrite
 104 void write_bytes(bufwriter* w, const unsigned char* src, size_t len) {
 105     const size_t rem = w->cap - w->len;
 106     if (len < rem) {
 107         memcpy(w->buf + w->len, src, len);
 108         w->len += len;
 109         return;
 110     }
 111 
 112     for (size_t i = 0; i < len; i++) {
 113         write_byte(w, src[i]);
 114     }
 115 }
 116 
 117 void flush(bufwriter* w) {
 118     if (w->len > 0) {
 119         fwrite(w->buf, 1, w->len, w->out);
 120     }
 121     w->len = 0;
 122     fflush(w->out);
 123 }
 124 
 125 // slice is a growable region of bytes in memory
 126 typedef struct slice {
 127     // ptr is the starting place of the region
 128     unsigned char* ptr;
 129 
 130     // cap is how many bytes the memory region has available
 131     size_t cap;
 132 } slice;
 133 
 134 bool starts_with_bom(const unsigned char* b, const size_t n) {
 135     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 136 }
 137 
 138 bool has_tabs(const unsigned char* b, const size_t n) {
 139     for (size_t i = 0; i < n; i++) {
 140         if (b[i] == '\t') {
 141             return true;
 142         }
 143     }
 144     return false;
 145 }
 146 
 147 size_t count_tabs(const unsigned char* b, const size_t n) {
 148     size_t tabs = 0;
 149     for (size_t i = 0; i < n; i++) {
 150         tabs += (b[i] == '\t');
 151     }
 152     return tabs;
 153 }
 154 
 155 // write_tsv_line returns the number of tab-separated values emitted; current
 156 // line isn't ended with a line-feed, which must be emitted separately
 157 size_t write_tsv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
 158     // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
 159     if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
 160         len -= 2;
 161     } else if (len >= 1 && p[len - 1] == '\n') {
 162         len--;
 163     }
 164 
 165     write_bytes(w, p, len);
 166     return count_tabs(p, len) + 1;
 167 }
 168 
 169 // write_ssv_line returns the number of tab-separated values emitted; current
 170 // line isn't ended with a line-feed, which must be emitted separately
 171 size_t write_ssv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
 172     // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
 173     if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
 174         len -= 2;
 175     } else if (len >= 1 && p[len - 1] == '\n') {
 176         len--;
 177     }
 178 
 179     // ignore leading spaces
 180     while (len > 0 && p[0] == ' ') {
 181         p++;
 182         len--;
 183     }
 184 
 185     // trailing spaces are inconsequential, since there's nothing to follow
 186     // them, which in turn prevents their normal single-tab substitutes from
 187     // being emitted
 188 
 189     size_t items = 0;
 190     bool space = false;
 191     unsigned char sep = '\t';
 192 
 193     for (size_t i = 0; i < len; i++) {
 194         unsigned char b = p[i];
 195 
 196         if (b == ' ') {
 197             space = true;
 198             continue;
 199         }
 200 
 201         if (items == width) {
 202             sep = ' ';
 203         }
 204 
 205         if (space) {
 206             write_byte(w, sep);
 207             items++;
 208             space = false;
 209         }
 210         write_byte(w, b);
 211     }
 212 
 213     return items;
 214 }
 215 
 216 void handle_reader(bufwriter* w, FILE* r, slice* line, bool live_lines) {
 217     size_t (*write_items)(bufwriter*, const unsigned char*, size_t, size_t) = NULL;
 218     size_t items = 0;
 219 
 220     for (size_t i = 0; !feof(w->out); i++) {
 221         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 222         if (len < 0) {
 223             break;
 224         }
 225 
 226         if (line->ptr == NULL) {
 227             fprintf(stderr, "\n");
 228             fprintf(stderr, ERROR_LINE("out of memory"));
 229             exit(BAD_ALLOC);
 230         }
 231 
 232         unsigned char* ptr = line->ptr;
 233 
 234         // turn trailing carriage-returns into line-feeds
 235         if (len >= 1 && ptr[len - 1] == '\r') {
 236             ptr[len - 1] = '\n';
 237         }
 238 
 239         // get rid of carriage-returns preceding line-feeds
 240         if (len >= 2 && ptr[len - 2] == '\r' && ptr[len - 1] == '\n') {
 241             ptr[len - 2] = '\n';
 242             len--;
 243         }
 244 
 245         // 1st line: figure out if lines are already TSV, remember item-count,
 246         // and ignore UTF-8 byte-order marks
 247         if (i == 0) {
 248             if (starts_with_bom(ptr, len)) {
 249                 ptr += 3;
 250                 len -= 3;
 251             }
 252 
 253             const bool tsv = has_tabs(ptr, len);
 254             write_items = tsv ? write_tsv_line : write_ssv_line;
 255             items = write_items(w, ptr, len, SIZE_MAX);
 256             write_byte(w, '\n');
 257             if (live_lines) {
 258                 flush(w);
 259             }
 260             continue;
 261         }
 262 
 263         // write normal data lines
 264         size_t got = write_items(w, ptr, len, items);
 265         // add empty fields, when missing trailing ones
 266         for (size_t j = got; j < items; j++) {
 267             write_byte(w, '\t');
 268         }
 269 
 270         write_byte(w, '\n');
 271         if (live_lines) {
 272             flush(w);
 273         }
 274     }
 275 }
 276 
 277 // handle_file handles data from the filename given; returns false only when
 278 // the file can't be opened
 279 bool handle_file(bufwriter* w, const char* path, slice* line, bool live_lines) {
 280     FILE* f = fopen(path, "rb");
 281     if (f == NULL) {
 282         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 283         return false;
 284     }
 285 
 286     handle_reader(w, f, line, live_lines);
 287     fclose(f);
 288     return true;
 289 }
 290 
 291 // run returns the number of errors
 292 int run(int argc, char** argv, FILE* w, bool live_lines) {
 293     unsigned char outbuf[OBUF_SIZE];
 294     bufwriter bw;
 295 
 296     size_t dashes = 0;
 297     for (int i = 1; i < argc; i++) {
 298         if (strcmp(argv[i], "-") == 0) {
 299             dashes++;
 300         }
 301     }
 302 
 303     if (dashes > 1) {
 304         const char* m = "can't use the standard input (dash) more than once";
 305         fprintf(stderr, ERROR_LINE("%s"), m);
 306         return 1;
 307     }
 308 
 309     slice line;
 310     line.cap = 32 * 1024;
 311     line.ptr = malloc(line.cap);
 312 
 313     if (live_lines) {
 314         if (line.ptr == NULL) {
 315             fprintf(stderr, ERROR_LINE("out of memory"));
 316             exit(BAD_ALLOC);
 317         }
 318     }
 319 
 320     init_bufwriter(&bw, w, outbuf, sizeof(outbuf));
 321 
 322     size_t errors = 0;
 323     for (int i = 1; i < argc && !feof(w); i++) {
 324         if (strcmp(argv[i], "-") == 0) {
 325             handle_reader(&bw, stdin, &line, live_lines);
 326             continue;
 327         }
 328 
 329         if (!handle_file(&bw, argv[i], &line, live_lines)) {
 330             errors++;
 331         }
 332     }
 333 
 334     // use stdin when not given any filepaths
 335     if (argc <= 1) {
 336         handle_reader(&bw, stdin, &line, live_lines);
 337     }
 338 
 339     free(line.ptr);
 340     flush(&bw);
 341     return errors;
 342 }
 343 
 344 int main(int argc, char** argv) {
 345 #ifdef _WIN32
 346     setmode(fileno(stdin), O_BINARY);
 347     // ensure output lines end in LF instead of CRLF on windows
 348     setmode(fileno(stdout), O_BINARY);
 349     setmode(fileno(stderr), O_BINARY);
 350 #endif
 351 
 352     if (argc > 1) {
 353         if (
 354             strcmp(argv[1], "-h") == 0 ||
 355             strcmp(argv[1], "-help") == 0 ||
 356             strcmp(argv[1], "--h") == 0 ||
 357             strcmp(argv[1], "--help") == 0
 358         ) {
 359             fprintf(stdout, "%s", info);
 360             return 0;
 361         }
 362     }
 363 
 364     const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0;
 365     if (!live_lines) {
 366         setvbuf(stdout, NULL, _IOFBF, 0);
 367     }
 368     return run(argc, argv, stdout, live_lines) == 0 ? 0 : 1;
 369 }