File: coby.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 coby [files...]
  27 
  28 
  29 COunt BYtes finds out some simple byte-related stats, counting
  30 
  31     - bytes
  32     - lines
  33     - how many lines have trailing spaces
  34     - how many lines end with a CRLF pair
  35     - all-off (0) bytes
  36     - all-on (255) bytes
  37     - high-bytes (128+)
  38     - which (if any) byte-order mark the data start with
  39 
  40 The output is TSV (tab-separated values) lines, where the first line has
  41 all the column names.
  42 
  43 When no filepaths are given, the standard input is used by default.
  44 */
  45 
  46 /*
  47 You can build this command-line app by running
  48 
  49 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./coby ./coby.c
  50 */
  51 
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  57 
  58 #ifdef _WIN32
  59 #include <fcntl.h>
  60 #include <windows.h>
  61 #endif
  62 
  63 #ifdef RED_ERRORS
  64 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  65 #ifdef __APPLE__
  66 #define ERROR_STYLE "\x1b[31m"
  67 #endif
  68 #define RESET_STYLE "\x1b[0m"
  69 #else
  70 #define ERROR_STYLE
  71 #define RESET_STYLE
  72 #endif
  73 
  74 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  75 
// IBUF_SIZE is the size in bytes of the buffer count_bytes reads input
// chunks into; the guard lets a build define its own value ahead of this
#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif
  79 
  80 const char* header = ""
  81     "name\tbytes\tlines\tlf\tcrlf\tspaces\ttabs"
  82     "\ttrails\tnulls\tfulls\thighs\tbom";
  83 
  84 enum {
  85     no_bom = 0,
  86     utf8_bom = 1,
  87     utf16le_bom = 2,
  88     utf16be_bom = 3,
  89     utf32le_bom = 4,
  90     utf32be_bom = 5,
  91 };
  92 
  93 const char* bom_legend[] = {
  94     "",
  95     "UTF-8",
  96     "UTF-16 LE",
  97     "UTF-16 BE",
  98     "UTF-32 LE",
  99     "UTF-32 BE",
 100 };
 101 
 102 // stats holds all byte-related counts this app deals with
 103 typedef struct stats {
 104     uint64_t bytes;  // the total byte-count
 105     uint64_t lines;  // how many plain-text lines
 106     uint64_t lf;     // how many line-feeds
 107     uint64_t crlf;   // how many carriage-return/line-feed pairs
 108     uint64_t spaces; // how many spaces
 109     uint64_t tabs;   // how many tabs
 110     uint64_t trails; // how many plain-text lines with trailing spaces
 111     uint64_t nulls;  // how many all-bits-off bytes
 112     uint64_t fulls;  // how many all-bits-on bytes
 113     uint64_t highs;  // how many bytes with the highest-order bit on
 114     uint64_t bom;    // which (if any) kind of byte-order mark data start with
 115 } stats;
 116 
 117 uint64_t check_bom(unsigned char* data, size_t len) {
 118     const unsigned char* d = data;
 119 
 120     if (len >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) {
 121         return utf8_bom;
 122     }
 123     if (len >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0) {
 124         return utf32le_bom;
 125     }
 126     if (len >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff) {
 127         return utf32be_bom;
 128     }
 129     if (len >= 2 && data[0] == 0xff && data[1] == 0xfe) {
 130         return utf16le_bom;
 131     }
 132     if (len >= 2 && data[0] == 0xfe && data[1] == 0xff) {
 133         return utf16be_bom;
 134     }
 135 
 136     return no_bom;
 137 }
 138 
 139 // count_bytes gathers all sorts of byte-related stats, besides a total count
 140 void count_bytes(FILE* r, stats* res) {
 141     unsigned char buf[IBUF_SIZE];
 142     uint64_t tally[256];
 143 
 144     uint64_t bytes = 0;
 145     uint64_t crlf = 0;
 146     uint64_t trails = 0;
 147 
 148     unsigned char prev2 = 0;
 149     unsigned char prev1 = 0;
 150     unsigned char cur = 0;
 151 
 152     memset(tally, 0, sizeof(tally));
 153     memset(res, 0, sizeof(stats));
 154 
 155     while (true) {
 156         const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
 157         if (len < 1) {
 158             break;
 159         }
 160 
 161         if (bytes == 0) {
 162             res->bom = check_bom(buf, len);
 163         }
 164         bytes += len;
 165 
 166         for (size_t i = 0; i < len; i++) {
 167             cur = buf[i];
 168             tally[cur]++;
 169             if (cur == '\n') {
 170                 if (prev1 == ' ') {
 171                     trails++;
 172                 } else if (prev1 == '\r') {
 173                     crlf++;
 174                     if (prev2 == ' ') {
 175                         trails++;
 176                     }
 177                 }
 178             }
 179 
 180             prev2 = prev1;
 181             prev1 = cur;
 182         }
 183     }
 184 
 185     if (cur == ' ') {
 186         trails++;
 187     } else if (cur == '\r') {
 188         crlf++;
 189         if (prev1 == ' ') {
 190             trails++;
 191         }
 192     }
 193 
 194     res->bytes = bytes;
 195     res->crlf = crlf;
 196     res->trails = trails;
 197 
 198     res->lines = tally['\n'];
 199     res->lf = tally['\n'];
 200     res->spaces = tally[' '];
 201     res->tabs = tally['\t'];
 202     res->nulls = tally[0];
 203     res->fulls = tally[255];
 204 
 205     res->highs = 0;
 206     for (size_t i = 128; i < 256; i++) {
 207         res->highs += tally[i];
 208     }
 209 
 210     // count last line for non-empty inputs not ending with a line-feed byte
 211     if (res->bytes > 0 && prev1 != '\n') {
 212         res->lines++;
 213     }
 214 
 215     // count last trail for inputs not ending with a line-feed byte
 216     if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
 217         res->trails++;
 218     }
 219 }
 220 
 221 // handle_input gathers stats, and shows a TSV line out of the results
 222 void handle_input(FILE* r, stats* res, const char* name) {
 223     // show the filename right away, to reassure users something's happening
 224     printf("%s", name);
 225     fflush(stdout);
 226 
 227     count_bytes(r, res);
 228 
 229     // show results as soon as they're available
 230     printf("\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu",
 231         res->bytes, res->lines, res->lf, res->crlf, res->spaces,
 232         res->tabs,res->trails, res->nulls, res->fulls, res->highs);
 233     printf("\t%s\n", bom_legend[res->bom]);
 234     fflush(stdout);
 235 }
 236 
 237 // handle_file handles data from the filename given, and returns whether the
 238 // file was opened successfully
 239 bool handle_file(const char* path, stats* res) {
 240     FILE* f = fopen(path, "rb");
 241     if (f == NULL) {
 242         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 243         return false;
 244     }
 245 
 246     handle_input(f, res, path);
 247     fclose(f);
 248     return true;
 249 }
 250 
 251 // run returns the number of errors
 252 int run(int argc, char** argv) {
 253     size_t empty = 0;
 254     size_t dashes = 0;
 255 
 256     for (int i = 1; i < argc; i++) {
 257         if (argv[i][0] == 0) {
 258             empty++;
 259             continue;
 260         }
 261 
 262         if (argv[i][0] == '-' && argv[i][1] == 0) {
 263             dashes++;
 264         }
 265     }
 266 
 267     if (dashes > 1) {
 268         const char* msg = "can't use a dash (stdin) as input more than once";
 269         fprintf(stderr, ERROR_LINE("%s"), msg);
 270         return 1;
 271     }
 272 
 273     // show header line right away, to reassure users the app is working
 274     printf("%s\n", header);
 275 
 276     // if output is done, don't even bother doing anything
 277     if (feof(stdout)) {
 278         return 0;
 279     }
 280 
 281     // use stdin when not given any filepaths, or when all paths are empty
 282     if (argc <= 1 || empty == argc - 1) {
 283         stats res;
 284         handle_input(stdin, &res, "-");
 285         return 0;
 286     }
 287 
 288     stats res;
 289     size_t errors = 0;
 290 
 291     for (int i = 1; i < argc; i++) {
 292         // if output is done while being piped, quit right away
 293         if (feof(stdout)) {
 294             return errors;
 295         }
 296 
 297         // ignore empty names
 298         if (argv[i][0] == 0) {
 299             continue;
 300         }
 301 
 302         // handle `-` as stdin
 303         if (argv[i][0] == '-' && argv[i][1] == 0) {
 304             handle_input(stdin, &res, argv[i]);
 305             continue;
 306         }
 307 
 308         errors += !handle_file(argv[i], &res);
 309     }
 310 
 311     return errors;
 312 }
 313 
 314 int main(int argc, char** argv) {
 315 #ifdef _WIN32
 316     setmode(fileno(stdin), O_BINARY);
 317     // ensure output lines end in LF instead of CRLF on windows
 318     setmode(fileno(stdout), O_BINARY);
 319     setmode(fileno(stderr), O_BINARY);
 320 #endif
 321 
 322     return run(argc, argv) == 0 ? 0 : 1;
 323 }