File: jsons.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O2 -o ./jsons ./jsons.c
  29 
  30 To use empty strings for missing trailing cells, you can build it using
  31 
  32 cc -Wall -s -O2 -D PURE_JSONS -o ./jsons ./jsons.c
  33 */
  34 
  35 #include <stdbool.h>
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 
  40 #ifdef _WIN32
  41 #include <fcntl.h>
  42 #include <windows.h>
  43 #endif
  44 
  45 #ifndef PURE_JSONS
  46 #define NULL_TRAILS
  47 #endif
  48 
  49 #ifdef NULL_TRAILS
  50 const char* info = ""
  51 "jsons [filename...]\n"
  52 "\n"
  53 "Turn a TSV table into JSON Strings, which is a JSON array with objects of\n"
  54 "string values. The only non-string values are nulls, which are used for any\n"
  55 "missing trailing fields.\n"
  56 "";
  57 #else
  58 const char* info = ""
  59 "jsons [filename...]\n"
  60 "\n"
  61 "Turn a TSV table into JSON Strings, which is a JSON array with objects of\n"
  62 "string values.\n"
  63 "";
  64 #endif
  65 
  66 const char* no_line_memory_msg = "can't get enough memory to read lines";
  67 
  68 // span is a region of bytes in memory
  69 typedef struct span {
  70     // ptr is the starting place of the region
  71     unsigned char* ptr;
  72 
  73     // len is how many bytes are in the region
  74     size_t len;
  75 } span;
  76 
  77 // slice is a growable region of bytes in memory
  78 typedef struct slice {
  79     // ptr is the starting place of the region
  80     unsigned char* ptr;
  81 
  82     // len is how many bytes are currently being used
  83     size_t len;
  84 
  85     // cap is how many bytes the memory region has available
  86     size_t cap;
  87 } slice;
  88 
  89 bool starts_with_bom(const unsigned char* b, const size_t n) {
  90     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
  91 }
  92 
  93 // emit_json_item emits JSON strings for the TSV items given: such items are
  94 // strings without line-feeds, carriage-returns, or tabs
  95 void emit_json_item(FILE* w, const unsigned char* ptr, size_t len) {
  96     putc('"', w);
  97     for (size_t i = 0; i < len; i++) {
  98         const unsigned char b = ptr[i];
  99         if (b == '"' || b == '\\') {
 100             putc('\\', w);
 101         }
 102         putc(b, w);
 103     }
 104     putc('"', w);
 105 }
 106 
 107 // emit_json_item_str is like function emit_json_item, but taking c-strings
 108 void emit_json_item_str(FILE* w, const char* s) {
 109     putc('"', w);
 110     for (; *s != 0; s++) {
 111         const unsigned char b = *s;
 112         if (b == '"' || b == '\\') {
 113             putc('\\', w);
 114         }
 115         putc(b, w);
 116     }
 117     putc('"', w);
 118 }
 119 
 120 void handle_row(FILE* w, char** keys, size_t num_keys, span line) {
 121     size_t start = 0;
 122     size_t len = 0;
 123     size_t got = 0;
 124 
 125     putc(' ', w);
 126     putc(' ', w);
 127     putc('{', w);
 128 
 129     for (size_t i = 0; i < line.len; i++) {
 130         if (line.ptr[i] == '\t') {
 131             if (got > 0) {
 132                 fprintf(w, ", ");
 133             }
 134 
 135             emit_json_item_str(w, keys[got]);
 136             fprintf(w, ": ");
 137             emit_json_item(w, line.ptr + start, len);
 138 
 139             start = i + 1;
 140             len = 0;
 141             got++;
 142         } else {
 143             len++;
 144         }
 145     }
 146 
 147     if (start < line.len) {
 148         if (got > 0) {
 149             fprintf(w, ", ");
 150         }
 151 
 152         emit_json_item_str(w, keys[got]);
 153         fprintf(w, ": ");
 154         emit_json_item(w, line.ptr + start, len);
 155         got++;
 156     }
 157 
 158     for (size_t i = got; i < num_keys; i++) {
 159         if (i > 0) {
 160             fprintf(w, ", ");
 161         }
 162 
 163         emit_json_item_str(w, keys[got]);
 164 #ifdef NULL_TRAILS
 165         fprintf(w, ": null");
 166 #else
 167         fprintf(w, ": \"\"");
 168 #endif
 169     }
 170 
 171     putc('}', w);
 172 }
 173 
 174 void show_error(FILE* w, const char* msg) {
 175     putc('\n', w);
 176     fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
 177 }
 178 
 179 // handle_reader skips leading UTF-8 BOMs (byte-order marks), and turns all
 180 // CR-LF pairs into single LF bytes
 181 bool handle_reader(FILE* w, FILE* r, slice* line) {
 182     span trimmed;
 183     char** keys = NULL;
 184     size_t num_keys = 0;
 185     size_t i = 0;
 186 
 187     for (i = 0; !feof(w); i++) {
 188         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 189         if (len < 0) {
 190             break;
 191         }
 192 
 193         if (line->ptr == NULL) {
 194             show_error(w, no_line_memory_msg);
 195             exit(1);
 196         }
 197 
 198         line->len = len;
 199         trimmed.ptr = line->ptr;
 200         trimmed.len = line->len;
 201 
 202         // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
 203         if (i == 0 && starts_with_bom(trimmed.ptr, trimmed.len)) {
 204             trimmed.ptr += 3;
 205             trimmed.len -= 3;
 206             len = trimmed.len;
 207         }
 208 
 209         const unsigned char* p = trimmed.ptr;
 210         // get rid of trailing line-feeds and CRLF end-of-line byte-pairs
 211         if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
 212             trimmed.len -= 2;
 213         } else if (len >= 1 && p[len - 1] == '\n') {
 214             trimmed.len--;
 215         }
 216 
 217         if (i == 0) {
 218             for (size_t j = 0; j < trimmed.len; j++) {
 219                 if (trimmed.ptr[j] == '\t') {
 220                     num_keys++;
 221                 }
 222             }
 223             num_keys++;
 224 
 225             keys = malloc(sizeof(char*) * num_keys);
 226             if (keys == NULL) {
 227                 show_error(w, no_line_memory_msg);
 228                 exit(1);
 229             }
 230 
 231             keys[0] = malloc(trimmed.len + 1);
 232             if (keys[0] == NULL) {
 233                 show_error(w, no_line_memory_msg);
 234                 exit(1);
 235             }
 236 
 237             char* copy = keys[0];
 238             memcpy(copy, trimmed.ptr, trimmed.len);
 239             copy[trimmed.len] = 0;
 240 
 241             for (size_t j = 0, k = 1; j < trimmed.len; j++) {
 242                 if (copy[j] == '\t') {
 243                     copy[j] = 0;
 244                     keys[k] = copy + j + 1;
 245                     k++;
 246                 }
 247             }
 248             continue;
 249         }
 250 
 251         if (i == 1) {
 252             putc('[', w);
 253         } else {
 254             putc(',', w);
 255         }
 256         putc('\n', w);
 257         fflush(w);
 258 
 259         handle_row(w, keys, num_keys, trimmed);
 260     }
 261 
 262     if (i > 0) {
 263         fprintf(w, "\n]\n");
 264     } else {
 265         fprintf(w, "[]\n");
 266     }
 267 
 268     if (keys != NULL) {
 269         if (keys[0] != NULL) {
 270             free(keys[0]);
 271         }
 272         free(keys);
 273     }
 274 
 275     fflush(w);
 276     return true;
 277 }
 278 
 279 // handle_file handles data from the filename given; returns false only when
 280 // the file can't be opened
 281 bool handle_file(FILE* w, const char* fname, slice* line) {
 282     FILE* f = fopen(fname, "rb");
 283     if (f == NULL) {
 284         fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", fname);
 285         return false;
 286     }
 287 
 288     const bool ok = handle_reader(w, f, line);
 289     fclose(f);
 290     return ok;
 291 }
 292 
 293 bool run(FILE* w, const char* fname) {
 294     slice line;
 295     line.len = 0;
 296     line.cap = 32 * 1024;
 297     line.ptr = malloc(line.cap);
 298 
 299     if (line.ptr == NULL) {
 300         fprintf(stderr, "\x1b[31m%s\x1b[0m\n", no_line_memory_msg);
 301         return false;
 302     }
 303 
 304     // filename `-` means use the standard input
 305     if (fname[0] == '-' && fname[1] == 0) {
 306         const bool ok = handle_reader(w, stdin, &line);
 307         free(line.ptr);
 308         return ok;
 309     }
 310 
 311     const bool ok = handle_file(w, fname, &line);
 312     free(line.ptr);
 313     return ok;
 314 }
 315 
 316 int main(int argc, char** argv) {
 317 #ifdef _WIN32
 318     setmode(fileno(stdin), O_BINARY);
 319     // ensure output lines end in LF instead of CRLF on windows
 320     setmode(fileno(stdout), O_BINARY);
 321     setmode(fileno(stderr), O_BINARY);
 322 #endif
 323 
 324     if (argc > 1) {
 325         if (
 326             strcmp(argv[1], "-h") == 0 ||
 327             strcmp(argv[1], "-help") == 0 ||
 328             strcmp(argv[1], "--h") == 0 ||
 329             strcmp(argv[1], "--help") == 0
 330         ) {
 331             fprintf(stdout, "%s", info);
 332             return 0;
 333         }
 334     }
 335 
 336     if (argc > 2) {
 337         const char* msg = "can't use more than 1 named input";
 338         fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
 339         return 1;
 340     }
 341 
 342     return run(stdout, (argc > 1) ? argv[1] : "-") ? 0 : 1;
 343 }