File: coby.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 coby [files...]
  27 
  28 
COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - line-feeds
    - how many lines have trailing spaces
    - how many lines end with a CRLF pair
    - spaces
    - tabs
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data start with
  39 
  40 The output is TSV (tab-separated values) lines, where the first line has
  41 all the column names.
  42 
  43 When no filepaths are given, the standard input is used by default.
  44 */
  45 
  46 /*
  47 You can build this command-line app by running
  48 
  49 cc -Wall -s -O2 -march=native -mtune=native -flto -o ./coby ./coby.c
  50 */
  51 
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  57 
  58 #ifdef _WIN32
  59 #include <fcntl.h>
  60 #include <windows.h>
  61 #endif
  62 
  63 #ifdef RED_ERRORS
  64 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  65 #ifdef __APPLE__
  66 #define ERROR_STYLE "\x1b[31m"
  67 #endif
  68 #define RESET_STYLE "\x1b[0m"
  69 #else
  70 #define ERROR_STYLE
  71 #define RESET_STYLE
  72 #endif
  73 
  74 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  75 
// IBUF_SIZE is the size (in bytes) of the buffer used to read input chunks;
// the default below only applies when the macro isn't already defined, which
// allows overriding it at build time
#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif
  79 
  80 const char* header = ""
  81     "name\tbytes\tlines\tlf\tcrlf\tspaces\ttabs"
  82     "\ttrails\tnulls\tfulls\thighs\tbom";
  83 
  84 enum {
  85     no_bom = 0,
  86     utf8_bom = 1,
  87     utf16le_bom = 2,
  88     utf16be_bom = 3,
  89     utf32le_bom = 4,
  90     utf32be_bom = 5,
  91 };
  92 
  93 const char* bom_legend[] = {
  94     "",
  95     "UTF-8",
  96     "UTF-16 LE",
  97     "UTF-16 BE",
  98     "UTF-32 LE",
  99     "UTF-32 BE",
 100 };
 101 
// stats holds all byte-related counts this app deals with: count_bytes fills
// all its fields, and handle_input shows them all as a single TSV line
typedef struct stats {
    uint64_t bytes;  // the total byte-count
    uint64_t lines;  // how many lines, counting a last one not ending in LF
    uint64_t lf;     // how many line-feed bytes
    uint64_t crlf;   // how many carriage-return/line-feed pairs
    uint64_t spaces; // how many space bytes
    uint64_t tabs;   // how many tab bytes
    uint64_t trails; // how many lines end with a space, ignoring line-endings
    uint64_t nulls;  // how many all-bits-off (0) bytes
    uint64_t fulls;  // how many all-bits-on (255) bytes
    uint64_t highs;  // how many bytes with the highest-order bit on (128+)
    uint64_t bom;    // byte-order mark kind found: used to index bom_legend
} stats;
 116 
 117 uint64_t check_bom(unsigned char* data, size_t len) {
 118     const unsigned char* d = data;
 119 
 120     if (len >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) {
 121         return utf8_bom;
 122     }
 123     if (len >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0) {
 124         return utf32le_bom;
 125     }
 126     if (len >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff) {
 127         return utf32be_bom;
 128     }
 129     if (len >= 2 && data[0] == 0xff && data[1] == 0xfe) {
 130         return utf16le_bom;
 131     }
 132     if (len >= 2 && data[0] == 0xfe && data[1] == 0xff) {
 133         return utf16be_bom;
 134     }
 135 
 136     return no_bom;
 137 }
 138 
 139 // count_bytes gathers all sorts of byte-related stats, besides a total count
 140 void count_bytes(FILE* r, stats* res) {
 141     unsigned char buf[IBUF_SIZE];
 142     uint64_t tally[256];
 143 
 144     uint64_t bytes = 0;
 145     uint64_t crlf = 0;
 146     uint64_t trails = 0;
 147 
 148     unsigned char prev2 = 0;
 149     unsigned char prev1 = 0;
 150     unsigned char cur = 0;
 151 
 152     memset(tally, 0, sizeof(tally));
 153     memset(res, 0, sizeof(stats));
 154 
 155     while (true) {
 156         const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
 157         if (len < 1) {
 158             break;
 159         }
 160 
 161         if (bytes == 0) {
 162             res->bom = check_bom(buf, len);
 163         }
 164         bytes += len;
 165 
 166         for (size_t i = 0; i < len; i++) {
 167             cur = buf[i];
 168             tally[cur]++;
 169 
 170             if (cur == '\n') {
 171                 if (prev1 == ' ') {
 172                     trails++;
 173                 } else if (prev1 == '\r') {
 174                     crlf++;
 175                     if (prev2 == ' ') {
 176                         trails++;
 177                     }
 178                 }
 179             }
 180 
 181             prev2 = prev1;
 182             prev1 = cur;
 183         }
 184     }
 185 
 186     if (cur == ' ') {
 187         trails++;
 188     } else if (cur == '\r') {
 189         crlf++;
 190         if (prev1 == ' ') {
 191             trails++;
 192         }
 193     }
 194 
 195     res->bytes = bytes;
 196     res->crlf = crlf;
 197     res->trails = trails;
 198 
 199     res->lines = tally['\n'];
 200     res->lf = tally['\n'];
 201     res->spaces = tally[' '];
 202     res->tabs = tally['\t'];
 203     res->nulls = tally[0];
 204     res->fulls = tally[255];
 205 
 206     res->highs = 0;
 207     for (size_t i = 128; i < 256; i++) {
 208         res->highs += tally[i];
 209     }
 210 
 211     // count last line for non-empty inputs not ending with a line-feed byte
 212     if (res->bytes > 0 && prev1 != '\n') {
 213         res->lines++;
 214     }
 215 
 216     // count last trail for inputs not ending with a line-feed byte
 217     if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
 218         res->trails++;
 219     }
 220 }
 221 
 222 // handle_input gathers stats, and shows a TSV line out of the results
 223 void handle_input(FILE* r, stats* res, const char* name) {
 224     // show the filename right away, to reassure users something's happening
 225     printf("%s", name);
 226     fflush(stdout);
 227 
 228     count_bytes(r, res);
 229 
 230     // show results as soon as they're available
 231     printf("\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu",
 232         res->bytes, res->lines, res->lf, res->crlf, res->spaces,
 233         res->tabs,res->trails, res->nulls, res->fulls, res->highs);
 234     printf("\t%s\n", bom_legend[res->bom]);
 235     fflush(stdout);
 236 }
 237 
 238 // handle_file handles data from the filename given, and returns whether the
 239 // file was opened successfully
 240 bool handle_file(const char* path, stats* res) {
 241     FILE* f = fopen(path, "rb");
 242     if (f == NULL) {
 243         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 244         return false;
 245     }
 246 
 247     handle_input(f, res, path);
 248     fclose(f);
 249     return true;
 250 }
 251 
 252 // run returns the number of errors
 253 int run(int argc, char** argv) {
 254     size_t empty = 0;
 255     size_t dashes = 0;
 256 
 257     for (int i = 1; i < argc; i++) {
 258         if (argv[i][0] == 0) {
 259             empty++;
 260             continue;
 261         }
 262 
 263         if (strcmp(argv[i], "-") == 0) {
 264             dashes++;
 265         }
 266     }
 267 
 268     if (dashes > 1) {
 269         const char* msg = "can't use a dash (stdin) as input more than once";
 270         fprintf(stderr, ERROR_LINE("%s"), msg);
 271         return 1;
 272     }
 273 
 274     // show header line right away, to reassure users the app is working
 275     printf("%s\n", header);
 276 
 277     // if output is done, don't even bother doing anything
 278     if (feof(stdout)) {
 279         return 0;
 280     }
 281 
 282     // use stdin when not given any filepaths, or when all paths are empty
 283     if (argc <= 1 || empty == argc - 1) {
 284         stats res;
 285         handle_input(stdin, &res, "-");
 286         return 0;
 287     }
 288 
 289     stats res;
 290     size_t errors = 0;
 291 
 292     for (int i = 1; i < argc; i++) {
 293         // if output is done while being piped, quit right away
 294         if (feof(stdout)) {
 295             return errors;
 296         }
 297 
 298         // ignore empty names
 299         if (argv[i][0] == 0) {
 300             continue;
 301         }
 302 
 303         // handle `-` as stdin
 304         if (strcmp(argv[i], "-") == 0) {
 305             handle_input(stdin, &res, argv[i]);
 306             continue;
 307         }
 308 
 309         errors += !handle_file(argv[i], &res);
 310     }
 311 
 312     return errors;
 313 }
 314 
 315 int main(int argc, char** argv) {
 316 #ifdef _WIN32
 317     setmode(fileno(stdin), O_BINARY);
 318     // ensure output lines end in LF instead of CRLF on windows
 319     setmode(fileno(stdout), O_BINARY);
 320     setmode(fileno(stderr), O_BINARY);
 321 #endif
 322 
 323     return run(argc, argv) == 0 ? 0 : 1;
 324 }