File: coby.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2024 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 coby [files...]
  27 
  28 
  29 COunt BYtes finds out some simple byte-related stats, counting
  30 
  31     - bytes
  32     - runes, which are UTF-8 code-points
  33     - lines
  34     - how many lines have trailing spaces
  35     - how many lines end with a CRLF pair
  36     - all-off (0) bytes
  37     - all-on (255) bytes
  38     - high-bytes (128+)
  39     - which (if any) byte-order mark the data start with
  40 
  41 The output is TSV (tab-separated values) lines, where the first line has
  42 all the column names.
  43 
  44 When no filepaths are given, the standard input is used by default.
  45 */
  46 
  47 /*
  48 You can build this command-line app by running
  49     cc -Wall -s -O2 -o ./coby ./coby.c
  50 */
  51 
#include <fcntl.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  58 
  59 #ifdef _WIN32
  60 #include <windows.h>
  61 #endif
  62 
  63 const char* header = ""
  64     "name\tbytes\trunes\tlines\tlf\tcrlf\tspaces\ttabs"
  65     "\ttrails\tnulls\tfulls\thighs\tbom";
  66 
  67 enum {
  68     no_bom = 0,
  69     utf8_bom = 1,
  70     utf16le_bom = 2,
  71     utf16be_bom = 3,
  72     utf32le_bom = 4,
  73     utf32be_bom = 5,
  74 };
  75 
  76 const char* bom_legend[] = {
  77     "",
  78     "UTF-8",
  79     "UTF-16 LE",
  80     "UTF-16 BE",
  81     "UTF-32 LE",
  82     "UTF-32 BE",
  83 };
  84 
  85 // stats holds all byte-related counts this app deals with
  86 typedef struct stats {
  87     uint64_t bytes;  // the total byte-count
  88     uint64_t runes;  // how many utf-8 items
  89     uint64_t lines;  // how many plain-text lines
  90     uint64_t lf;     // how many line-feeds
  91     uint64_t crlf;   // how many carriage-return/line-feed pairs
  92     uint64_t spaces; // how many spaces
  93     uint64_t tabs;   // how many tabs
  94     uint64_t trails; // how many plain-text lines with trailing spaces
  95     uint64_t nulls;  // how many all-bits-off bytes
  96     uint64_t fulls;  // how many all-bits-on bytes
  97     uint64_t highs;  // how many bytes with the highest-order bit on
  98     uint64_t bom;    // which (if any) kind of byte-order mark data start with
  99 } stats;
 100 
 101 uint64_t check_bom(unsigned char* data, size_t len) {
 102     const unsigned char* d = data;
 103 
 104     if (len >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) {
 105         return utf8_bom;
 106     }
 107     if (len >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0) {
 108         return utf32le_bom;
 109     }
 110     if (len >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff) {
 111         return utf32be_bom;
 112     }
 113     if (len >= 2 && data[0] == 0xff && data[1] == 0xfe) {
 114         return utf16le_bom;
 115     }
 116     if (len >= 2 && data[0] == 0xfe && data[1] == 0xff) {
 117         return utf16be_bom;
 118     }
 119 
 120     return no_bom;
 121 }
 122 
 123 // count_bytes gathers all sorts of byte-related stats, besides a total count
 124 void count_bytes(FILE* r, stats* res) {
 125     unsigned char buf[48 * 1024];
 126     uint64_t tally[256];
 127 
 128     uint64_t bytes = 0;
 129     uint64_t crlf = 0;
 130     uint64_t trails = 0;
 131     uint64_t runes = 0;
 132 
 133     unsigned char prev2 = 0;
 134     unsigned char prev1 = 0;
 135 
 136     memset(tally, 0, sizeof(tally));
 137     memset(res, 0, sizeof(stats));
 138 
 139     while (true) {
 140         const size_t len = fread(buf, sizeof(unsigned char), sizeof(buf), r);
 141 
 142         if (len < 1) {
 143             break;
 144         }
 145 
 146         if (bytes == 0) {
 147             res->bom = check_bom(buf, len);
 148         }
 149         bytes += len;
 150 
 151         for (size_t i = 0; i < len; i++) {
 152             const unsigned char cur = buf[i];
 153             tally[cur]++;
 154 
 155             crlf += (prev1 == '\r') && (cur == '\n');
 156             trails += (cur == '\n') &&
 157                 ((prev1 == ' ') || (prev2 == ' ' && prev1 == '\r'));
 158             runes += (cur & 0xc0) != 0x80;
 159 
 160             prev2 = prev1;
 161             prev1 = cur;
 162         }
 163     }
 164 
 165     res->bytes = bytes;
 166     res->crlf = crlf;
 167     res->trails = trails;
 168     res->runes = runes;
 169 
 170     res->lines = tally['\n'];
 171     res->lf = tally['\n'];
 172     res->spaces = tally[' '];
 173     res->tabs = tally['\t'];
 174     res->nulls = tally[0];
 175     res->fulls = tally[255];
 176 
 177     res->highs = 0;
 178     for (size_t i = 128; i < 256; i++) {
 179         res->highs += tally[i];
 180     }
 181 
 182     // count last line for non-empty inputs not ending with a line-feed byte
 183     if (res->bytes > 0 && prev1 != '\n') {
 184         res->lines++;
 185     }
 186 
 187     // count last trail for inputs not ending with a line-feed byte
 188     if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
 189         res->trails++;
 190     }
 191 }
 192 
 193 // handle_input gathers stats, and shows a TSV line out of the results
 194 void handle_input(FILE* r, stats* res, char* name) {
 195     // show the filename right away, to reassure users something's happening
 196     printf("%s", name);
 197     fflush(stdout);
 198 
 199     count_bytes(r, res);
 200 
 201     // show results as soon as they're available
 202     printf("\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t",
 203         res->bytes, res->runes, res->lines, res->lf, res->crlf, res->spaces,
 204         res->tabs,res->trails, res->nulls, res->fulls, res->highs);
 205     printf("%s\n", bom_legend[res->bom]);
 206     fflush(stdout);
 207 }
 208 
 209 // handle_file handles data from the filename given, and returns whether the
 210 // file was opened successfully
 211 bool handle_file(char* fname, stats* res) {
 212     FILE* f = fopen(fname, "rb");
 213     if (f == NULL) {
 214         fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", fname);
 215         return false;
 216     }
 217 
 218     handle_input(f, res, fname);
 219     fclose(f);
 220     return true;
 221 }
 222 
 223 // run returns the number of errors
 224 size_t run(int argc, char** argv) {
 225     size_t empty = 0;
 226     size_t dashes = 0;
 227 
 228     for (int i = 1; i < argc; i++) {
 229         if (argv[i][0] == 0) {
 230             empty++;
 231             continue;
 232         }
 233 
 234         if (argv[i][0] == '-' && argv[i][1] == 0) {
 235             dashes++;
 236         }
 237     }
 238 
 239     if (dashes > 1) {
 240         const char* msg = "can't use a dash (stdin) as input more than once";
 241         fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
 242         return 1;
 243     }
 244 
 245     // show header line right away, to reassure users the app is working
 246     printf("%s\n", header);
 247     fflush(stdout);
 248 
 249     // if output is done, don't even bother doing anything
 250     if (feof(stdout)) {
 251         return 0;
 252     }
 253 
 254     // use stdin when not given any filepaths, or when all paths are empty
 255     if (argc <= 1 || empty == argc - 1) {
 256         stats res;
 257         handle_input(stdin, &res, "-");
 258         return 0;
 259     }
 260 
 261     stats res;
 262     size_t errors = 0;
 263 
 264     for (int i = 1; i < argc; i++) {
 265         // if output is done while being piped, quit right away
 266         if (feof(stdout)) {
 267             return errors;
 268         }
 269 
 270         // ignore empty names
 271         if (argv[i][0] == 0) {
 272             continue;
 273         }
 274 
 275         // handle `-` as stdin
 276         if (argv[i][0] == '-' && argv[i][1] == 0) {
 277             handle_input(stdin, &res, argv[i]);
 278             continue;
 279         }
 280 
 281         errors += !handle_file(argv[i], &res);
 282     }
 283 
 284     return errors;
 285 }
 286 
 287 int main(int argc, char** argv) {
 288 #ifdef _WIN32
 289     setmode(fileno(stdin), O_BINARY);
 290     // ensure output lines end in LF instead of CRLF on windows
 291     setmode(fileno(stdout), O_BINARY);
 292     setmode(fileno(stderr), O_BINARY);
 293 #endif
 294 
 295     // disable automatic stdio buffering, in favor of explicit buffering
 296     setvbuf(stdin, NULL, _IONBF, 0);
 297     setvbuf(stdout, NULL, _IONBF, 0);
 298     setvbuf(stderr, NULL, _IONBF, 0);
 299 
 300     return run(argc, argv) == 0 ? 0 : 1;
 301 }