File: jsons.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -flto -o ./jsons ./jsons.c
  29 
  30 To use empty strings for missing trailing cells, you can build it using
  31 
  32 cc -Wall -s -O3 -flto -D PURE_JSONS -o ./jsons ./jsons.c
  33 */
  34 
  35 #include <stdbool.h>
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 
  40 #ifdef _WIN32
  41 #include <fcntl.h>
  42 #include <windows.h>
  43 #endif
  44 
  45 #ifdef RED_ERRORS
  46 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  47 #ifdef __APPLE__
  48 #define ERROR_STYLE "\x1b[31m"
  49 #endif
  50 #define RESET_STYLE "\x1b[0m"
  51 #else
  52 #define ERROR_STYLE
  53 #define RESET_STYLE
  54 #endif
  55 
  56 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  57 
  58 #ifndef PURE_JSONS
  59 #define NULL_TRAILS
  60 #endif
  61 
  62 #ifdef NULL_TRAILS
  63 const char* info = ""
  64 "jsons [filename...]\n"
  65 "\n"
  66 "Turn a TSV table into JSON Strings, which is a JSON array with objects of\n"
  67 "string values. The only non-string values are nulls, which are used for any\n"
  68 "missing trailing fields.\n"
  69 "";
  70 #else
  71 const char* info = ""
  72 "jsons [filename...]\n"
  73 "\n"
  74 "Turn a TSV table into JSON Strings, which is a JSON array with objects of\n"
  75 "string values.\n"
  76 "";
  77 #endif
  78 
  79 // span is a region of bytes in memory
  80 typedef struct span {
  81     // ptr is the starting place of the region
  82     unsigned char* ptr;
  83 
  84     // len is how many bytes are in the region
  85     size_t len;
  86 } span;
  87 
  88 // slice is a growable region of bytes in memory
  89 typedef struct slice {
  90     // ptr is the starting place of the region
  91     unsigned char* ptr;
  92 
  93     // len is how many bytes are currently being used
  94     size_t len;
  95 
  96     // cap is how many bytes the memory region has available
  97     size_t cap;
  98 } slice;
  99 
 100 bool starts_with_bom(const unsigned char* b, const size_t n) {
 101     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 102 }
 103 
 104 // emit_json_item emits JSON strings for the TSV items given: such items are
 105 // strings without line-feeds, carriage-returns, or tabs
 106 void emit_json_item(FILE* w, const unsigned char* ptr, size_t len) {
 107     fputc('"', w);
 108     for (size_t i = 0; i < len; i++) {
 109         const unsigned char b = ptr[i];
 110         if (b == '"' || b == '\\') {
 111             fputc('\\', w);
 112         }
 113         fputc(b, w);
 114     }
 115     fputc('"', w);
 116 }
 117 
 118 // emit_json_item_str is like function emit_json_item, but taking c-strings
 119 void emit_json_item_str(FILE* w, const char* s) {
 120     fputc('"', w);
 121     for (; *s != 0; s++) {
 122         const unsigned char b = *s;
 123         if (b == '"' || b == '\\') {
 124             fputc('\\', w);
 125         }
 126         fputc(b, w);
 127     }
 128     fputc('"', w);
 129 }
 130 
 131 void handle_row(FILE* w, char** keys, size_t num_keys, span line) {
 132     size_t start = 0;
 133     size_t len = 0;
 134     size_t got = 0;
 135 
 136     fputc(' ', w);
 137     fputc(' ', w);
 138     fputc('{', w);
 139 
 140     for (size_t i = 0; i < line.len; i++) {
 141         if (line.ptr[i] == '\t') {
 142             if (got > 0) {
 143                 fprintf(w, ", ");
 144             }
 145 
 146             emit_json_item_str(w, keys[got]);
 147             fprintf(w, ": ");
 148             emit_json_item(w, line.ptr + start, len);
 149 
 150             start = i + 1;
 151             len = 0;
 152             got++;
 153         } else {
 154             len++;
 155         }
 156     }
 157 
 158     if (start < line.len) {
 159         if (got > 0) {
 160             fprintf(w, ", ");
 161         }
 162 
 163         emit_json_item_str(w, keys[got]);
 164         fprintf(w, ": ");
 165         emit_json_item(w, line.ptr + start, len);
 166         got++;
 167     }
 168 
 169     for (size_t i = got; i < num_keys; i++) {
 170         if (i > 0) {
 171             fprintf(w, ", ");
 172         }
 173 
 174         emit_json_item_str(w, keys[got]);
 175 #ifdef NULL_TRAILS
 176         fprintf(w, ": null");
 177 #else
 178         fprintf(w, ": \"\"");
 179 #endif
 180     }
 181 
 182     fputc('}', w);
 183 }
 184 
 185 // handle_reader skips leading UTF-8 BOMs (byte-order marks), and turns all
 186 // CR-LF pairs into single LF bytes
 187 bool handle_reader(FILE* w, FILE* r, slice* line) {
 188     span trimmed;
 189     char** keys = NULL;
 190     size_t num_keys = 0;
 191     size_t i = 0;
 192 
 193     for (i = 0; !feof(w); i++) {
 194         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 195         if (len < 0) {
 196             break;
 197         }
 198 
 199         if (line->ptr == NULL) {
 200             fprintf(stderr, ERROR_LINE("out of memory"));
 201             exit(1);
 202         }
 203 
 204         line->len = len;
 205         trimmed.ptr = line->ptr;
 206         trimmed.len = line->len;
 207 
 208         // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
 209         if (i == 0 && starts_with_bom(trimmed.ptr, trimmed.len)) {
 210             trimmed.ptr += 3;
 211             trimmed.len -= 3;
 212             len = trimmed.len;
 213         }
 214 
 215         const unsigned char* p = trimmed.ptr;
 216         // get rid of trailing line-feeds and CRLF end-of-line byte-pairs
 217         if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
 218             trimmed.len -= 2;
 219         } else if (len >= 1 && p[len - 1] == '\n') {
 220             trimmed.len--;
 221         }
 222 
 223         if (i == 0) {
 224             for (size_t j = 0; j < trimmed.len; j++) {
 225                 if (trimmed.ptr[j] == '\t') {
 226                     num_keys++;
 227                 }
 228             }
 229             num_keys++;
 230 
 231             keys = malloc(sizeof(char*) * num_keys);
 232             if (keys == NULL) {
 233                 fprintf(stderr, ERROR_LINE("out of memory"));
 234                 exit(1);
 235             }
 236 
 237             keys[0] = malloc(trimmed.len + 1);
 238             if (keys[0] == NULL) {
 239                 fprintf(stderr, ERROR_LINE("out of memory"));
 240                 exit(1);
 241             }
 242 
 243             char* copy = keys[0];
 244             memcpy(copy, trimmed.ptr, trimmed.len);
 245             copy[trimmed.len] = 0;
 246 
 247             for (size_t j = 0, k = 1; j < trimmed.len; j++) {
 248                 if (copy[j] == '\t') {
 249                     copy[j] = 0;
 250                     keys[k] = copy + j + 1;
 251                     k++;
 252                 }
 253             }
 254             continue;
 255         }
 256 
 257         if (i == 1) {
 258             fputc('[', w);
 259         } else {
 260             fputc(',', w);
 261         }
 262         fputc('\n', w);
 263 
 264         handle_row(w, keys, num_keys, trimmed);
 265     }
 266 
 267     if (i > 0) {
 268         fprintf(w, "\n]\n");
 269     } else {
 270         fprintf(w, "[]\n");
 271     }
 272 
 273     if (keys != NULL) {
 274         if (keys[0] != NULL) {
 275             free(keys[0]);
 276         }
 277         free(keys);
 278     }
 279 
 280     return true;
 281 }
 282 
 283 // handle_file handles data from the filename given; returns false only when
 284 // the file can't be opened
 285 bool handle_file(FILE* w, const char* path, slice* line) {
 286     FILE* f = fopen(path, "rb");
 287     if (f == NULL) {
 288         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 289         return false;
 290     }
 291 
 292     const bool ok = handle_reader(w, f, line);
 293     fclose(f);
 294     return ok;
 295 }
 296 
 297 bool run(FILE* w, const char* path) {
 298     slice line;
 299     line.len = 0;
 300     line.cap = 32 * 1024;
 301     line.ptr = malloc(line.cap);
 302 
 303     if (line.ptr == NULL) {
 304         fprintf(stderr, ERROR_LINE("out of memory"));
 305         return false;
 306     }
 307 
 308     // filename `-` means use the standard input
 309     if (path[0] == '-' && path[1] == 0) {
 310         const bool ok = handle_reader(w, stdin, &line);
 311         free(line.ptr);
 312         return ok;
 313     }
 314 
 315     const bool ok = handle_file(w, path, &line);
 316     free(line.ptr);
 317     return ok;
 318 }
 319 
 320 int main(int argc, char** argv) {
 321 #ifdef _WIN32
 322     setmode(fileno(stdin), O_BINARY);
 323     // ensure output lines end in LF instead of CRLF on windows
 324     setmode(fileno(stdout), O_BINARY);
 325     setmode(fileno(stderr), O_BINARY);
 326 #endif
 327 
 328     if (argc > 1) {
 329         if (
 330             strcmp(argv[1], "-h") == 0 ||
 331             strcmp(argv[1], "-help") == 0 ||
 332             strcmp(argv[1], "--h") == 0 ||
 333             strcmp(argv[1], "--help") == 0
 334         ) {
 335             fprintf(stdout, "%s", info);
 336             return 0;
 337         }
 338     }
 339 
 340     if (argc > 2) {
 341         const char* msg = "can't use more than 1 named input";
 342         fprintf(stderr, ERROR_LINE("%s"), msg);
 343         return 1;
 344     }
 345 
 346     return run(stdout, (argc > 1) ? argv[1] : "-") ? 0 : 1;
 347 }