// File: dessv.c

/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./dessv ./dessv.c
*/

// getline, fileno, and lseek are POSIX functions, not ISO C ones: ask for
// their declarations explicitly, so strict modes such as -std=c11 work too
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// ERROR_STYLE and RESET_STYLE are ANSI escape sequences wrapped around error
// messages when RED_ERRORS is defined, and are empty otherwise; each macro is
// defined exactly once on every path, so no macro is ever redefined
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

// ERROR_LINE turns a string literal into a styled line ending in a line-feed
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

// BAD_ALLOC is the exit code used when memory can't be allocated
#define BAD_ALLOC 2

// OBUF_SIZE is the size of the output buffer, and can be set at build time
#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

// info is the help message
const char* info = ""
"dessv [filenames...]\n"
"\n"
"Turn Space(s)-Separated Values (SSV) into Tab-Separated Values (TSV), where\n"
"both leading and trailing spaces from input lines are ignored.\n"
"";

// bufwriter is, as the name implies, a buffered-writer: when it's aimed at
// stdout, it considerably speeds up this app, as intended
typedef struct bufwriter {
    // buf is the buffer proper
    unsigned char* buf;

    // len is how many bytes of the buffer are currently being used
    size_t len;

    // cap is the capacity of the buffer, or the most bytes it can hold
    size_t cap;

    // out is the destination of all that's written into the buffer
    FILE* out;
} bufwriter;

// init_bufwriter sets up an empty buffered-writer which sends its bytes to
// the output given, using the caller-owned buffer given, which can hold up
// to cap bytes
void init_bufwriter(bufwriter* w, FILE* out, unsigned char* b, size_t cap) {
    w->buf = b;
    w->len = 0;
    w->cap = cap;
    w->out = out;
}

// write_byte adds a byte to the buffer, first emitting the buffer's contents
// when it's already full; write failures aren't reported here, and are
// noticed later via ferror on the output
void write_byte(bufwriter* w, unsigned char b) {
    // a zero-capacity buffer can't hold anything, so write straight through
    if (w->cap == 0) {
        fputc(b, w->out);
        return;
    }

    if (w->len < w->cap) {
        w->buf[w->len++] = b;
        return;
    }

    fwrite(w->buf, 1, w->cap, w->out);
    w->buf[0] = b;
    w->len = 1;
}

// write_bytes does as it says, minimizing the number of calls to fwrite:
// bytes are copied in chunks as big as the buffer's free space allows, and
// a full buffer is only emitted when there are more bytes to add to it
void write_bytes(bufwriter* w, const unsigned char* src, size_t len) {
    // a zero-capacity buffer can't hold anything, so write straight through
    if (w->cap == 0) {
        if (len > 0) {
            fwrite(src, 1, len, w->out);
        }
        return;
    }

    while (len > 0) {
        if (w->len == w->cap) {
            fwrite(w->buf, 1, w->cap, w->out);
            w->len = 0;
        }

        const size_t room = w->cap - w->len;
        const size_t n = (len < room) ? len : room;
        memcpy(w->buf + w->len, src, n);
        w->len += n;
        src += n;
        len -= n;
    }
}

// flush emits all bytes still in the buffer, then flushes the output itself
void flush(bufwriter* w) {
    if (w->len > 0) {
        fwrite(w->buf, 1, w->len, w->out);
    }
    w->len = 0;
    fflush(w->out);
}

// slice is a growable region of bytes in memory
typedef struct slice {
    // ptr is the starting place of the region
    unsigned char* ptr;

    // cap is how many bytes the memory region has available
    size_t cap;
} slice;

// starts_with_bom checks if the bytes given start with a UTF-8 byte-order mark
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// has_tabs checks if any of the n bytes given is a tab
bool has_tabs(const unsigned char* b, const size_t n) {
    for (size_t i = 0; i < n; i++) {
        if (b[i] == '\t') {
            return true;
        }
    }
    return false;
}

// count_tabs counts how many of the n bytes given are tabs
size_t count_tabs(const unsigned char* b, const size_t n) {
    size_t tabs = 0;
    for (size_t i = 0; i < n; i++) {
        tabs += (b[i] == '\t');
    }
    return tabs;
}

// trim_line_ending returns the length given, minus any trailing CRLF
// byte-pair, or any trailing line-feed byte
static size_t trim_line_ending(const unsigned char* p, size_t len) {
    if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
        return len - 2;
    }
    if (len >= 1 && p[len - 1] == '\n') {
        return len - 1;
    }
    return len;
}

// write_tsv_line emits a line which is already tab-separated as is, except
// for its line-ending, and returns the number of tab-separated values
// emitted, which is the number of tabs plus 1; current line isn't ended with
// a line-feed, which must be emitted separately; the width parameter is
// ignored, and is only there to match the signature of write_ssv_line
size_t write_tsv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
    (void)width;

    // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
    len = trim_line_ending(p, len);

    write_bytes(w, p, len);
    return count_tabs(p, len) + 1;
}

// write_ssv_line emits a space(s)-separated line, turning each run of spaces
// between items into a single separator: separators are tabs until `width`
// of them have been emitted, and single spaces from then on, which keeps any
// extra items as part of the last field
//
// the result is the number of separators emitted, not the number of items:
// it's 1 less than the item-count for non-empty lines, and 0 for empty ones;
// handle_reader gets both its width and its per-line counts from the same
// function, so its padding of short lines stays consistent
//
// current line isn't ended with a line-feed, which must be emitted separately
size_t write_ssv_line(bufwriter* w, const unsigned char* p, size_t len, size_t width) {
    // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
    len = trim_line_ending(p, len);

    // ignore leading spaces
    while (len > 0 && p[0] == ' ') {
        p++;
        len--;
    }

    // trailing spaces are inconsequential, since there's nothing to follow
    // them, which in turn prevents their normal single-tab substitutes from
    // being emitted

    size_t seps = 0;
    bool space = false;
    unsigned char sep = '\t';

    for (size_t i = 0; i < len; i++) {
        const unsigned char b = p[i];

        if (b == ' ') {
            space = true;
            continue;
        }

        // once enough tabs are out, keep extra items in the last field
        if (seps == width) {
            sep = ' ';
        }

        if (space) {
            write_byte(w, sep);
            seps++;
            space = false;
        }
        write_byte(w, b);
    }

    return seps;
}

// handle_reader turns all lines from the input given into TSV lines: the
// first line decides whether the input is treated as TSV (it has tabs) or as
// SSV (it doesn't), and also sets how many fields later lines are padded to;
// when live_lines is set, output is flushed after each line
//
// the line slice is a buffer reused across calls, which getline may grow;
// the loop ends early if the output stream has a write error
void handle_reader(bufwriter* w, FILE* r, slice* line, bool live_lines) {
    size_t (*write_items)(bufwriter*, const unsigned char*, size_t, size_t) = NULL;
    size_t items = 0;

    // ferror is what detects failed writes: feof is only ever set by input
    // operations, so it can't tell anything about an output stream
    for (size_t i = 0; !ferror(w->out); i++) {
        errno = 0;
        ssize_t len = getline((char**)&line->ptr, &line->cap, r);
        if (len < 0) {
            // getline returns -1 both at the end of input and on failure:
            // running out of memory must not look like a normal end of input
            if (errno == ENOMEM) {
                fprintf(stderr, "\n");
                fprintf(stderr, ERROR_LINE("out of memory"));
                exit(BAD_ALLOC);
            }
            break;
        }

        unsigned char* ptr = line->ptr;

        // turn trailing carriage-returns into line-feeds
        if (len >= 1 && ptr[len - 1] == '\r') {
            ptr[len - 1] = '\n';
        }

        // get rid of carriage-returns preceding line-feeds
        if (len >= 2 && ptr[len - 2] == '\r' && ptr[len - 1] == '\n') {
            ptr[len - 2] = '\n';
            len--;
        }

        // 1st line: figure out if lines are already TSV, remember item-count,
        // and ignore UTF-8 byte-order marks
        if (i == 0) {
            if (starts_with_bom(ptr, len)) {
                ptr += 3;
                len -= 3;
            }

            const bool tsv = has_tabs(ptr, len);
            write_items = tsv ? write_tsv_line : write_ssv_line;
            items = write_items(w, ptr, len, SIZE_MAX);
            write_byte(w, '\n');
            if (live_lines) {
                flush(w);
            }
            continue;
        }

        // write normal data lines
        const size_t got = write_items(w, ptr, len, items);
        // add empty fields, when missing trailing ones
        for (size_t j = got; j < items; j++) {
            write_byte(w, '\t');
        }

        write_byte(w, '\n');
        if (live_lines) {
            flush(w);
        }
    }
}

// handle_file handles data from the filename given; returns false only when
// the file can't be opened
bool handle_file(bufwriter* w, const char* path, slice* line, bool live_lines) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_reader(w, f, line, live_lines);
    fclose(f);
    return true;
}

// run handles all inputs named in the command-line arguments, where a single
// dash stands for the standard input, which is also what's used when no
// names are given; it returns the number of errors, and exits the whole app
// with code BAD_ALLOC when the line buffer can't be allocated
int run(int argc, char** argv, FILE* w, bool live_lines) {
    unsigned char outbuf[OBUF_SIZE];
    bufwriter bw;

    size_t dashes = 0;
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-") == 0) {
            dashes++;
        }
    }

    if (dashes > 1) {
        const char* m = "can't use the standard input (dash) more than once";
        fprintf(stderr, ERROR_LINE("%s"), m);
        return 1;
    }

    slice line;
    line.cap = 32 * 1024;
    line.ptr = malloc(line.cap);

    // the allocation must be checked no matter how output is flushed
    if (line.ptr == NULL) {
        fprintf(stderr, ERROR_LINE("out of memory"));
        exit(BAD_ALLOC);
    }

    init_bufwriter(&bw, w, outbuf, sizeof(outbuf));

    int errors = 0;
    // stop going through the inputs as soon as the output has a write error
    for (int i = 1; i < argc && !ferror(w); i++) {
        if (strcmp(argv[i], "-") == 0) {
            handle_reader(&bw, stdin, &line, live_lines);
            continue;
        }

        if (!handle_file(&bw, argv[i], &line, live_lines)) {
            errors++;
        }
    }

    // use stdin when not given any filepaths
    if (argc <= 1) {
        handle_reader(&bw, stdin, &line, live_lines);
    }

    free(line.ptr);
    flush(&bw);
    return errors;
}

// main shows the help message when the first argument asks for it, and
// otherwise runs the app; the exit code is 0 on success, and 1 on errors
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    if (argc > 1) {
        if (
            strcmp(argv[1], "-h") == 0 ||
            strcmp(argv[1], "-help") == 0 ||
            strcmp(argv[1], "--h") == 0 ||
            strcmp(argv[1], "--help") == 0
        ) {
            fprintf(stdout, "%s", info);
            return 0;
        }
    }

    // lseek gives a non-zero result both when it fails (it returns -1 then)
    // and when stdout's current offset isn't 0: either way output is flushed
    // line by line; NOTE(review): presumably this is meant to detect pipes
    // and terminals, which can't seek - confirm the non-zero-offset case
    const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0;
    if (!live_lines) {
        setvbuf(stdout, NULL, _IOFBF, 0);
    }
    return run(argc, argv, stdout, live_lines) == 0 ? 0 : 1;
}