File: coby.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2024 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 coby [files...]
  27 
  28 
  29 COunt BYtes finds out some simple byte-related stats, counting
  30 
  31     - bytes
  32     - runes, which are UTF-8 code-points
  33     - lines
  34     - how many lines have trailing spaces
  35     - how many lines end with a CRLF pair
  36     - all-off (0) bytes
  37     - all-on (255) bytes
  38     - high-bytes (128+)
  39     - which (if any) byte-order mark the data start with
  40 
  41 The output is TSV (tab-separated values) lines, where the first line has
  42 all the column names.
  43 
  44 When no filepaths are given, the standard input is used by default.
  45 */
  46 
  47 /*
  48 You can build this command-line app by running
  49 
  50 cc -Wall -s -O2 -o ./coby ./coby.c
  51 */
  52 
#include <fcntl.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  59 
  60 #ifdef _WIN32
  61 #include <windows.h>
  62 #endif
  63 
  64 const char* header = ""
  65     "name\tbytes\trunes\tlines\tlf\tcrlf\tspaces\ttabs"
  66     "\ttrails\tnulls\tfulls\thighs\tbom";
  67 
  68 enum {
  69     no_bom = 0,
  70     utf8_bom = 1,
  71     utf16le_bom = 2,
  72     utf16be_bom = 3,
  73     utf32le_bom = 4,
  74     utf32be_bom = 5,
  75 };
  76 
  77 const char* bom_legend[] = {
  78     "",
  79     "UTF-8",
  80     "UTF-16 LE",
  81     "UTF-16 BE",
  82     "UTF-32 LE",
  83     "UTF-32 BE",
  84 };
  85 
// stats holds all byte-related counts this app deals with: count_bytes
// zeroes and fills one of these per input, and handle_input prints it
typedef struct stats {
    uint64_t bytes;  // the total byte-count
    uint64_t runes;  // how many bytes aren't UTF-8 continuation bytes
    uint64_t lines;  // how many line-feeds, plus 1 if the last line lacks one
    uint64_t lf;     // how many line-feed bytes
    uint64_t crlf;   // how many line-feeds come right after a carriage-return
    uint64_t spaces; // how many space bytes
    uint64_t tabs;   // how many tab bytes
    uint64_t trails; // how many lines end with a space, before any LF/CRLF
    uint64_t nulls;  // how many all-bits-off (0) bytes
    uint64_t fulls;  // how many all-bits-on (255) bytes
    uint64_t highs;  // how many bytes with the highest-order bit on (128+)
    uint64_t bom;    // BOM kind the data start with: an index into bom_legend
} stats;
 101 
 102 uint64_t check_bom(unsigned char* data, size_t len) {
 103     const unsigned char* d = data;
 104 
 105     if (len >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) {
 106         return utf8_bom;
 107     }
 108     if (len >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0) {
 109         return utf32le_bom;
 110     }
 111     if (len >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff) {
 112         return utf32be_bom;
 113     }
 114     if (len >= 2 && data[0] == 0xff && data[1] == 0xfe) {
 115         return utf16le_bom;
 116     }
 117     if (len >= 2 && data[0] == 0xfe && data[1] == 0xff) {
 118         return utf16be_bom;
 119     }
 120 
 121     return no_bom;
 122 }
 123 
 124 // count_bytes gathers all sorts of byte-related stats, besides a total count
 125 void count_bytes(FILE* r, stats* res) {
 126     unsigned char buf[32 * 1024];
 127     uint64_t tally[256];
 128 
 129     uint64_t bytes = 0;
 130     uint64_t crlf = 0;
 131     uint64_t trails = 0;
 132     uint64_t runes = 0;
 133 
 134     unsigned char prev2 = 0;
 135     unsigned char prev1 = 0;
 136 
 137     memset(tally, 0, sizeof(tally));
 138     memset(res, 0, sizeof(stats));
 139 
 140     while (true) {
 141         const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
 142 
 143         if (len < 1) {
 144             break;
 145         }
 146 
 147         if (bytes == 0) {
 148             res->bom = check_bom(buf, len);
 149         }
 150         bytes += len;
 151 
 152         for (size_t i = 0; i < len; i++) {
 153             const unsigned char cur = buf[i];
 154             tally[cur]++;
 155 
 156             crlf += (prev1 == '\r') && (cur == '\n');
 157             trails += (cur == '\n') &&
 158                 ((prev1 == ' ') || (prev2 == ' ' && prev1 == '\r'));
 159             runes += (cur & 0xc0) != 0x80;
 160 
 161             prev2 = prev1;
 162             prev1 = cur;
 163         }
 164     }
 165 
 166     res->bytes = bytes;
 167     res->crlf = crlf;
 168     res->trails = trails;
 169     res->runes = runes;
 170 
 171     res->lines = tally['\n'];
 172     res->lf = tally['\n'];
 173     res->spaces = tally[' '];
 174     res->tabs = tally['\t'];
 175     res->nulls = tally[0];
 176     res->fulls = tally[255];
 177 
 178     res->highs = 0;
 179     for (size_t i = 128; i < 256; i++) {
 180         res->highs += tally[i];
 181     }
 182 
 183     // count last line for non-empty inputs not ending with a line-feed byte
 184     if (res->bytes > 0 && prev1 != '\n') {
 185         res->lines++;
 186     }
 187 
 188     // count last trail for inputs not ending with a line-feed byte
 189     if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
 190         res->trails++;
 191     }
 192 }
 193 
 194 // handle_input gathers stats, and shows a TSV line out of the results
 195 void handle_input(FILE* r, stats* res, char* name) {
 196     // show the filename right away, to reassure users something's happening
 197     printf("%s", name);
 198     fflush(stdout);
 199 
 200     count_bytes(r, res);
 201 
 202     // show results as soon as they're available
 203     printf("\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t",
 204         res->bytes, res->runes, res->lines, res->lf, res->crlf, res->spaces,
 205         res->tabs,res->trails, res->nulls, res->fulls, res->highs);
 206     printf("%s\n", bom_legend[res->bom]);
 207     fflush(stdout);
 208 }
 209 
 210 // handle_file handles data from the filename given, and returns whether the
 211 // file was opened successfully
 212 bool handle_file(char* fname, stats* res) {
 213     FILE* f = fopen(fname, "rb");
 214     if (f == NULL) {
 215         fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", fname);
 216         return false;
 217     }
 218 
 219     handle_input(f, res, fname);
 220     fclose(f);
 221     return true;
 222 }
 223 
 224 // run returns the number of errors
 225 int run(int argc, char** argv) {
 226     size_t empty = 0;
 227     size_t dashes = 0;
 228 
 229     for (int i = 1; i < argc; i++) {
 230         if (argv[i][0] == 0) {
 231             empty++;
 232             continue;
 233         }
 234 
 235         if (argv[i][0] == '-' && argv[i][1] == 0) {
 236             dashes++;
 237         }
 238     }
 239 
 240     if (dashes > 1) {
 241         const char* msg = "can't use a dash (stdin) as input more than once";
 242         fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
 243         return 1;
 244     }
 245 
 246     // show header line right away, to reassure users the app is working
 247     printf("%s\n", header);
 248     fflush(stdout);
 249 
 250     // if output is done, don't even bother doing anything
 251     if (feof(stdout)) {
 252         return 0;
 253     }
 254 
 255     // use stdin when not given any filepaths, or when all paths are empty
 256     if (argc <= 1 || empty == argc - 1) {
 257         stats res;
 258         handle_input(stdin, &res, "-");
 259         return 0;
 260     }
 261 
 262     stats res;
 263     size_t errors = 0;
 264 
 265     for (int i = 1; i < argc; i++) {
 266         // if output is done while being piped, quit right away
 267         if (feof(stdout)) {
 268             return errors;
 269         }
 270 
 271         // ignore empty names
 272         if (argv[i][0] == 0) {
 273             continue;
 274         }
 275 
 276         // handle `-` as stdin
 277         if (argv[i][0] == '-' && argv[i][1] == 0) {
 278             handle_input(stdin, &res, argv[i]);
 279             continue;
 280         }
 281 
 282         errors += !handle_file(argv[i], &res);
 283     }
 284 
 285     return errors;
 286 }
 287 
 288 int main(int argc, char** argv) {
 289 #ifdef _WIN32
 290     setmode(fileno(stdin), O_BINARY);
 291     // ensure output lines end in LF instead of CRLF on windows
 292     setmode(fileno(stdout), O_BINARY);
 293     setmode(fileno(stderr), O_BINARY);
 294 #endif
 295 
 296     return run(argc, argv) == 0 ? 0 : 1;
 297 }