File: jsons.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -flto -o ./jsons ./jsons.c
  29 
  30 To use null values for missing trailing cells, you can build it using
  31 
  32 cc -Wall -s -O3 -flto -D NULL_TRAILS -o ./jsons ./jsons.c
  33 */
  34 
  35 #include <stdbool.h>
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 
  40 #ifdef _WIN32
  41 #include <fcntl.h>
  42 #include <windows.h>
  43 #endif
  44 
  45 #ifdef RED_ERRORS
  46 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  47 #ifdef __APPLE__
  48 #define ERROR_STYLE "\x1b[31m"
  49 #endif
  50 #define RESET_STYLE "\x1b[0m"
  51 #else
  52 #define ERROR_STYLE
  53 #define RESET_STYLE
  54 #endif
  55 
  56 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  57 
  58 #define BAD_ALLOC 2
  59 
  60 #ifdef NULL_TRAILS
  61 const char* info = ""
  62 "jsons [filename...]\n"
  63 "\n"
  64 "Turn a TSV table into JSON Strings, which is a JSON array with objects of\n"
  65 "string values. The only non-string values are nulls, which are used for any\n"
  66 "missing trailing fields.\n"
  67 "";
  68 #else
  69 const char* info = ""
  70 "jsons [filename...]\n"
  71 "\n"
  72 "Turn a TSV table into JSON Strings, which is a JSON array with objects of\n"
  73 "string values.\n"
  74 "";
  75 #endif
  76 
  77 // span is a region of bytes in memory
  78 typedef struct span {
  79     // ptr is the starting place of the region
  80     unsigned char* ptr;
  81 
  82     // len is how many bytes are in the region
  83     size_t len;
  84 } span;
  85 
  86 // slice is a growable region of bytes in memory
  87 typedef struct slice {
  88     // ptr is the starting place of the region
  89     unsigned char* ptr;
  90 
  91     // cap is how many bytes the memory region has available
  92     size_t cap;
  93 } slice;
  94 
  95 bool starts_with_bom(const unsigned char* b, const size_t n) {
  96     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
  97 }
  98 
  99 // emit_json_item emits JSON strings for the TSV items given: such items are
 100 // strings without line-feeds, carriage-returns, or tabs
 101 void emit_json_item(FILE* w, const unsigned char* ptr, size_t len) {
 102     fputc('"', w);
 103     for (size_t i = 0; i < len; i++) {
 104         const unsigned char b = ptr[i];
 105         if (b == '"' || b == '\\') {
 106             fputc('\\', w);
 107         }
 108         fputc(b, w);
 109     }
 110     fputc('"', w);
 111 }
 112 
 113 // emit_json_item_str is like function emit_json_item, but taking c-strings
 114 void emit_json_item_str(FILE* w, const char* s) {
 115     fputc('"', w);
 116     for (; *s != 0; s++) {
 117         const unsigned char b = *s;
 118         if (b == '"' || b == '\\') {
 119             fputc('\\', w);
 120         }
 121         fputc(b, w);
 122     }
 123     fputc('"', w);
 124 }
 125 
 126 void handle_row(FILE* w, char** keys, size_t num_keys, span line) {
 127     size_t start = 0;
 128     size_t len = 0;
 129     size_t got = 0;
 130 
 131     fputc(' ', w);
 132     fputc(' ', w);
 133     fputc('{', w);
 134 
 135     for (size_t i = 0; i < line.len; i++) {
 136         if (line.ptr[i] == '\t') {
 137             if (got > 0) {
 138                 fprintf(w, ", ");
 139             }
 140 
 141             emit_json_item_str(w, keys[got]);
 142             fprintf(w, ": ");
 143             emit_json_item(w, line.ptr + start, len);
 144 
 145             start = i + 1;
 146             len = 0;
 147             got++;
 148         } else {
 149             len++;
 150         }
 151     }
 152 
 153     if (start < line.len) {
 154         if (got > 0) {
 155             fprintf(w, ", ");
 156         }
 157 
 158         emit_json_item_str(w, keys[got]);
 159         fprintf(w, ": ");
 160         emit_json_item(w, line.ptr + start, len);
 161         got++;
 162     }
 163 
 164     for (size_t i = got; i < num_keys; i++) {
 165         if (i > 0) {
 166             fprintf(w, ", ");
 167         }
 168 
 169         emit_json_item_str(w, keys[got]);
 170 #ifdef NULL_TRAILS
 171         fprintf(w, ": null");
 172 #else
 173         fprintf(w, ": \"\"");
 174 #endif
 175     }
 176 
 177     fputc('}', w);
 178 }
 179 
 180 void handle_reader(FILE* w, FILE* r, slice* line) {
 181     char** keys = NULL;
 182     size_t num_keys = 0;
 183     size_t i = 0;
 184 
 185     for (i = 0; !feof(w); i++) {
 186         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 187         if (line->ptr == NULL) {
 188             fprintf(stderr, "\n");
 189             fprintf(stderr, ERROR_LINE("out of memory"));
 190             exit(BAD_ALLOC);
 191         }
 192 
 193         if (len < 0) {
 194             break;
 195         }
 196 
 197         unsigned char* ptr = line->ptr;
 198 
 199         // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
 200         if (i == 0 && starts_with_bom(ptr, len)) {
 201             ptr += 3;
 202             len -= 3;
 203         }
 204 
 205         // get rid of trailing line-feeds and CRLF end-of-line byte-pairs
 206         if (len >= 2 && ptr[len - 2] == '\r' && ptr[len - 1] == '\n') {
 207             len -= 2;
 208         } else if (len >= 1 && ptr[len - 1] == '\n') {
 209             len--;
 210         }
 211 
 212         if (i == 0) {
 213             for (size_t j = 0; j < len; j++) {
 214                 if (ptr[j] == '\t') {
 215                     num_keys++;
 216                 }
 217             }
 218             num_keys++;
 219 
 220             keys = malloc(sizeof(char*) * num_keys);
 221             if (keys == NULL) {
 222                 fprintf(stderr, "\n");
 223                 fprintf(stderr, ERROR_LINE("out of memory"));
 224                 exit(BAD_ALLOC);
 225             }
 226 
 227             keys[0] = malloc(len + 1);
 228             if (keys[0] == NULL) {
 229                 fprintf(stderr, "\n");
 230                 fprintf(stderr, ERROR_LINE("out of memory"));
 231                 exit(BAD_ALLOC);
 232             }
 233 
 234             char* copy = keys[0];
 235             memcpy(copy, ptr, len);
 236             copy[len] = 0;
 237 
 238             for (size_t j = 0, k = 1; j < len; j++) {
 239                 if (copy[j] == '\t') {
 240                     copy[j] = 0;
 241                     keys[k] = copy + j + 1;
 242                     k++;
 243                 }
 244             }
 245             continue;
 246         }
 247 
 248         if (i == 1) {
 249             fputc('[', w);
 250         } else {
 251             fputc(',', w);
 252         }
 253         fputc('\n', w);
 254 
 255         span s;
 256         s.ptr = ptr;
 257         s.len = len;
 258         handle_row(w, keys, num_keys, s);
 259     }
 260 
 261     if (i > 0) {
 262         fprintf(w, "\n]\n");
 263     } else {
 264         fprintf(w, "[]\n");
 265     }
 266 
 267     if (keys != NULL) {
 268         if (keys[0] != NULL) {
 269             free(keys[0]);
 270         }
 271         free(keys);
 272     }
 273 }
 274 
 275 // handle_file handles data from the filename given; returns false only when
 276 // the file can't be opened
 277 bool handle_file(FILE* w, const char* path, slice* line) {
 278     FILE* f = fopen(path, "rb");
 279     if (f == NULL) {
 280         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 281         return false;
 282     }
 283 
 284     handle_reader(w, f, line);
 285     fclose(f);
 286     return true;
 287 }
 288 
 289 bool run(FILE* w, const char* path) {
 290     slice line;
 291     line.cap = 32 * 1024;
 292     line.ptr = malloc(line.cap);
 293 
 294     if (line.ptr == NULL) {
 295         fprintf(stderr, "\n");
 296         fprintf(stderr, ERROR_LINE("out of memory"));
 297         exit(BAD_ALLOC);
 298     }
 299 
 300     // filename `-` means use the standard input
 301     if (strcmp(path, "-") == 0) {
 302         handle_reader(w, stdin, &line);
 303         free(line.ptr);
 304         return true;
 305     }
 306 
 307     const bool ok = handle_file(w, path, &line);
 308     free(line.ptr);
 309     return ok;
 310 }
 311 
 312 int main(int argc, char** argv) {
 313 #ifdef _WIN32
 314     setmode(fileno(stdin), O_BINARY);
 315     // ensure output lines end in LF instead of CRLF on windows
 316     setmode(fileno(stdout), O_BINARY);
 317     setmode(fileno(stderr), O_BINARY);
 318 #endif
 319 
 320     if (argc > 1) {
 321         if (
 322             strcmp(argv[1], "-h") == 0 ||
 323             strcmp(argv[1], "-help") == 0 ||
 324             strcmp(argv[1], "--h") == 0 ||
 325             strcmp(argv[1], "--help") == 0
 326         ) {
 327             fprintf(stdout, "%s", info);
 328             return 0;
 329         }
 330     }
 331 
 332     if (argc > 2) {
 333         const char* msg = "can't use more than 1 named input";
 334         fprintf(stderr, ERROR_LINE("%s"), msg);
 335         return 1;
 336     }
 337 
 338     return run(stdout, (argc > 1) ? argv[1] : "-") ? 0 : 1;
 339 }