File: coby.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 coby [files...]
  27 
  28 
  29 COunt BYtes finds out some simple byte-related stats, counting
  30 
    - bytes
    - runes, which are UTF-8 code-points (currently disabled, and not shown)
    - lines
    - line-feed bytes
    - how many lines have trailing spaces
    - how many lines end with a CRLF pair
    - spaces
    - tabs
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data starts with
  40 
  41 The output is TSV (tab-separated values) lines, where the first line has
  42 all the column names.
  43 
  44 When no filepaths are given, the standard input is used by default.
  45 */
  46 
  47 /*
  48 You can build this command-line app by running
  49 
  50 cc -Wall -s -O2 -o ./coby ./coby.c
  51 */
  52 
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  58 
  59 #ifdef _WIN32
  60 #include <fcntl.h>
  61 #include <windows.h>
  62 #endif
  63 
  64 // const char* header = ""
  65 //     "name\tbytes\trunes\tlines\tlf\tcrlf\tspaces\ttabs"
  66 //     "\ttrails\tnulls\tfulls\thighs\tbom";
  67 
  68 const char* header = ""
  69     "name\tbytes\tlines\tlf\tcrlf\tspaces\ttabs"
  70     "\ttrails\tnulls\tfulls\thighs\tbom";
  71 
  72 enum {
  73     no_bom = 0,
  74     utf8_bom = 1,
  75     utf16le_bom = 2,
  76     utf16be_bom = 3,
  77     utf32le_bom = 4,
  78     utf32be_bom = 5,
  79 };
  80 
  81 const char* bom_legend[] = {
  82     "",
  83     "UTF-8",
  84     "UTF-16 LE",
  85     "UTF-16 BE",
  86     "UTF-32 LE",
  87     "UTF-32 BE",
  88 };
  89 
  90 // stats holds all byte-related counts this app deals with
  91 typedef struct stats {
  92     uint64_t bytes;  // the total byte-count
  93     uint64_t runes;  // how many utf-8 items
  94     uint64_t lines;  // how many plain-text lines
  95     uint64_t lf;     // how many line-feeds
  96     uint64_t crlf;   // how many carriage-return/line-feed pairs
  97     uint64_t spaces; // how many spaces
  98     uint64_t tabs;   // how many tabs
  99     uint64_t trails; // how many plain-text lines with trailing spaces
 100     uint64_t nulls;  // how many all-bits-off bytes
 101     uint64_t fulls;  // how many all-bits-on bytes
 102     uint64_t highs;  // how many bytes with the highest-order bit on
 103     uint64_t bom;    // which (if any) kind of byte-order mark data start with
 104 } stats;
 105 
 106 uint64_t check_bom(unsigned char* data, size_t len) {
 107     const unsigned char* d = data;
 108 
 109     if (len >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) {
 110         return utf8_bom;
 111     }
 112     if (len >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0) {
 113         return utf32le_bom;
 114     }
 115     if (len >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff) {
 116         return utf32be_bom;
 117     }
 118     if (len >= 2 && data[0] == 0xff && data[1] == 0xfe) {
 119         return utf16le_bom;
 120     }
 121     if (len >= 2 && data[0] == 0xfe && data[1] == 0xff) {
 122         return utf16be_bom;
 123     }
 124 
 125     return no_bom;
 126 }
 127 
 128 // count_bytes gathers all sorts of byte-related stats, besides a total count
 129 void count_bytes(FILE* r, stats* res) {
 130     unsigned char buf[32 * 1024];
 131     uint64_t tally[256];
 132 
 133     uint64_t bytes = 0;
 134     uint64_t crlf = 0;
 135     uint64_t trails = 0;
 136     uint64_t runes = 0;
 137 
 138     unsigned char prev2 = 0;
 139     unsigned char prev1 = 0;
 140 
 141     memset(tally, 0, sizeof(tally));
 142     memset(res, 0, sizeof(stats));
 143 
 144     while (true) {
 145         const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
 146 
 147         if (len < 1) {
 148             break;
 149         }
 150 
 151         if (bytes == 0) {
 152             res->bom = check_bom(buf, len);
 153         }
 154         bytes += len;
 155 
 156         for (size_t i = 0; i < len; i++) {
 157             const unsigned char cur = buf[i];
 158             tally[cur]++;
 159 
 160             crlf += (prev1 == '\r') && (cur == '\n');
 161             trails += (cur == '\n') &&
 162                 ((prev1 == ' ') || (prev2 == ' ' && prev1 == '\r'));
 163             // runes += (cur & 0xc0) != 0x80;
 164 
 165             prev2 = prev1;
 166             prev1 = cur;
 167         }
 168     }
 169 
 170     res->bytes = bytes;
 171     res->crlf = crlf;
 172     res->trails = trails;
 173     res->runes = runes;
 174 
 175     res->lines = tally['\n'];
 176     res->lf = tally['\n'];
 177     res->spaces = tally[' '];
 178     res->tabs = tally['\t'];
 179     res->nulls = tally[0];
 180     res->fulls = tally[255];
 181 
 182     res->highs = 0;
 183     for (size_t i = 128; i < 256; i++) {
 184         res->highs += tally[i];
 185     }
 186 
 187     // count last line for non-empty inputs not ending with a line-feed byte
 188     if (res->bytes > 0 && prev1 != '\n') {
 189         res->lines++;
 190     }
 191 
 192     // count last trail for inputs not ending with a line-feed byte
 193     if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
 194         res->trails++;
 195     }
 196 }
 197 
 198 // handle_input gathers stats, and shows a TSV line out of the results
 199 void handle_input(FILE* r, stats* res, const char* name) {
 200     // show the filename right away, to reassure users something's happening
 201     printf("%s", name);
 202     fflush(stdout);
 203 
 204     count_bytes(r, res);
 205 
 206     // show results as soon as they're available
 207     // printf("\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t",
 208     //     res->bytes, res->runes, res->lines, res->lf, res->crlf, res->spaces,
 209     //     res->tabs,res->trails, res->nulls, res->fulls, res->highs);
 210     // printf("%s\n", bom_legend[res->bom]);
 211 
 212     // show results as soon as they're available
 213     printf("\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t",
 214         res->bytes, res->lines, res->lf, res->crlf, res->spaces,
 215         res->tabs,res->trails, res->nulls, res->fulls, res->highs);
 216     printf("%s\n", bom_legend[res->bom]);
 217 }
 218 
 219 // handle_file handles data from the filename given, and returns whether the
 220 // file was opened successfully
 221 bool handle_file(const char* fname, stats* res) {
 222     FILE* f = fopen(fname, "rb");
 223     if (f == NULL) {
 224         fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", fname);
 225         return false;
 226     }
 227 
 228     handle_input(f, res, fname);
 229     fclose(f);
 230     return true;
 231 }
 232 
 233 // run returns the number of errors
 234 int run(int argc, char** argv) {
 235     size_t empty = 0;
 236     size_t dashes = 0;
 237 
 238     for (int i = 1; i < argc; i++) {
 239         if (argv[i][0] == 0) {
 240             empty++;
 241             continue;
 242         }
 243 
 244         if (argv[i][0] == '-' && argv[i][1] == 0) {
 245             dashes++;
 246         }
 247     }
 248 
 249     if (dashes > 1) {
 250         const char* msg = "can't use a dash (stdin) as input more than once";
 251         fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
 252         return 1;
 253     }
 254 
 255     // show header line right away, to reassure users the app is working
 256     printf("%s\n", header);
 257 
 258     // if output is done, don't even bother doing anything
 259     if (feof(stdout)) {
 260         return 0;
 261     }
 262 
 263     // use stdin when not given any filepaths, or when all paths are empty
 264     if (argc <= 1 || empty == argc - 1) {
 265         stats res;
 266         handle_input(stdin, &res, "-");
 267         return 0;
 268     }
 269 
 270     stats res;
 271     size_t errors = 0;
 272 
 273     for (int i = 1; i < argc; i++) {
 274         // if output is done while being piped, quit right away
 275         if (feof(stdout)) {
 276             return errors;
 277         }
 278 
 279         // ignore empty names
 280         if (argv[i][0] == 0) {
 281             continue;
 282         }
 283 
 284         // handle `-` as stdin
 285         if (argv[i][0] == '-' && argv[i][1] == 0) {
 286             handle_input(stdin, &res, argv[i]);
 287             continue;
 288         }
 289 
 290         errors += !handle_file(argv[i], &res);
 291     }
 292 
 293     return errors;
 294 }
 295 
 296 int main(int argc, char** argv) {
 297 #ifdef _WIN32
 298     setmode(fileno(stdin), O_BINARY);
 299     // ensure output lines end in LF instead of CRLF on windows
 300     setmode(fileno(stdout), O_BINARY);
 301     setmode(fileno(stderr), O_BINARY);
 302 #endif
 303 
 304     return run(argc, argv) == 0 ? 0 : 1;
 305 }