File: coby.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 coby [files...]
  27 
  28 
  29 COunt BYtes finds out some simple byte-related stats, counting
  30 
    - bytes
    - lines
    - line-feed bytes
    - spaces
    - tabs
    - how many lines have trailing spaces
    - how many lines end with a CRLF pair
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data start with
  39 
  40 The output is TSV (tab-separated values) lines, where the first line has
  41 all the column names.
  42 
  43 When no filepaths are given, the standard input is used by default.
  44 */
  45 
  46 /*
  47 You can build this command-line app by running
  48 
  49 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./coby ./coby.c
  50 */
  51 
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif
  62 
  63 #ifdef RED_ERRORS
  64 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  65 #ifdef __APPLE__
  66 #define ERROR_STYLE "\x1b[31m"
  67 #endif
  68 #define RESET_STYLE "\x1b[0m"
  69 #else
  70 #define ERROR_STYLE
  71 #define RESET_STYLE
  72 #endif
  73 
  74 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  75 
  76 #ifndef IBUF_SIZE
  77 #define IBUF_SIZE (32 * 1024)
  78 #endif
  79 
  80 const char* header = ""
  81     "name\tbytes\tlines\tlf\tcrlf\tspaces\ttabs"
  82     "\ttrails\tnulls\tfulls\thighs\tbom";
  83 
  84 enum {
  85     no_bom = 0,
  86     utf8_bom = 1,
  87     utf16le_bom = 2,
  88     utf16be_bom = 3,
  89     utf32le_bom = 4,
  90     utf32be_bom = 5,
  91 };
  92 
  93 const char* bom_legend[] = {
  94     "",
  95     "UTF-8",
  96     "UTF-16 LE",
  97     "UTF-16 BE",
  98     "UTF-32 LE",
  99     "UTF-32 BE",
 100 };
 101 
 102 // stats holds all byte-related counts this app deals with
 103 typedef struct stats {
 104     uint64_t bytes;  // the total byte-count
 105     uint64_t lines;  // how many plain-text lines
 106     uint64_t lf;     // how many line-feeds
 107     uint64_t crlf;   // how many carriage-return/line-feed pairs
 108     uint64_t spaces; // how many spaces
 109     uint64_t tabs;   // how many tabs
 110     uint64_t trails; // how many plain-text lines with trailing spaces
 111     uint64_t nulls;  // how many all-bits-off bytes
 112     uint64_t fulls;  // how many all-bits-on bytes
 113     uint64_t highs;  // how many bytes with the highest-order bit on
 114     uint64_t bom;    // which (if any) kind of byte-order mark data start with
 115 } stats;
 116 
 117 uint64_t check_bom(unsigned char* data, size_t len) {
 118     const unsigned char* d = data;
 119 
 120     if (len >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) {
 121         return utf8_bom;
 122     }
 123     if (len >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0) {
 124         return utf32le_bom;
 125     }
 126     if (len >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff) {
 127         return utf32be_bom;
 128     }
 129     if (len >= 2 && data[0] == 0xff && data[1] == 0xfe) {
 130         return utf16le_bom;
 131     }
 132     if (len >= 2 && data[0] == 0xfe && data[1] == 0xff) {
 133         return utf16be_bom;
 134     }
 135 
 136     return no_bom;
 137 }
 138 
 139 // count_bytes gathers all sorts of byte-related stats, besides a total count
 140 void count_bytes(FILE* r, stats* res) {
 141     unsigned char buf[IBUF_SIZE];
 142     uint64_t tally[256];
 143 
 144     uint64_t bytes = 0;
 145     uint64_t crlf = 0;
 146     uint64_t trails = 0;
 147 
 148     unsigned char prev2 = 0;
 149     unsigned char prev1 = 0;
 150 
 151     memset(tally, 0, sizeof(tally));
 152     memset(res, 0, sizeof(stats));
 153 
 154     while (true) {
 155         const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
 156 
 157         if (len < 1) {
 158             break;
 159         }
 160 
 161         if (bytes == 0) {
 162             res->bom = check_bom(buf, len);
 163         }
 164         bytes += len;
 165 
 166         for (size_t i = 0; i < len; i++) {
 167             const unsigned char cur = buf[i];
 168             tally[cur]++;
 169 
 170             crlf += (prev1 == '\r') && (cur == '\n');
 171             trails += (cur == '\n') &&
 172                 ((prev1 == ' ') || (prev2 == ' ' && prev1 == '\r'));
 173 
 174             prev2 = prev1;
 175             prev1 = cur;
 176         }
 177     }
 178 
 179     res->bytes = bytes;
 180     res->crlf = crlf;
 181     res->trails = trails;
 182 
 183     res->lines = tally['\n'];
 184     res->lf = tally['\n'];
 185     res->spaces = tally[' '];
 186     res->tabs = tally['\t'];
 187     res->nulls = tally[0];
 188     res->fulls = tally[255];
 189 
 190     res->highs = 0;
 191     for (size_t i = 128; i < 256; i++) {
 192         res->highs += tally[i];
 193     }
 194 
 195     // count last line for non-empty inputs not ending with a line-feed byte
 196     if (res->bytes > 0 && prev1 != '\n') {
 197         res->lines++;
 198     }
 199 
 200     // count last trail for inputs not ending with a line-feed byte
 201     if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
 202         res->trails++;
 203     }
 204 }
 205 
 206 // handle_input gathers stats, and shows a TSV line out of the results
 207 void handle_input(FILE* r, stats* res, const char* name) {
 208     // show the filename right away, to reassure users something's happening
 209     printf("%s", name);
 210     fflush(stdout);
 211 
 212     count_bytes(r, res);
 213 
 214     // show results as soon as they're available
 215     printf("\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu",
 216         res->bytes, res->lines, res->lf, res->crlf, res->spaces,
 217         res->tabs,res->trails, res->nulls, res->fulls, res->highs);
 218     printf("\t%s\n", bom_legend[res->bom]);
 219     fflush(stdout);
 220 }
 221 
 222 // handle_file handles data from the filename given, and returns whether the
 223 // file was opened successfully
 224 bool handle_file(const char* path, stats* res) {
 225     FILE* f = fopen(path, "rb");
 226     if (f == NULL) {
 227         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 228         return false;
 229     }
 230 
 231     handle_input(f, res, path);
 232     fclose(f);
 233     return true;
 234 }
 235 
 236 // run returns the number of errors
 237 int run(int argc, char** argv) {
 238     size_t empty = 0;
 239     size_t dashes = 0;
 240 
 241     for (int i = 1; i < argc; i++) {
 242         if (argv[i][0] == 0) {
 243             empty++;
 244             continue;
 245         }
 246 
 247         if (argv[i][0] == '-' && argv[i][1] == 0) {
 248             dashes++;
 249         }
 250     }
 251 
 252     if (dashes > 1) {
 253         const char* msg = "can't use a dash (stdin) as input more than once";
 254         fprintf(stderr, ERROR_LINE("%s"), msg);
 255         return 1;
 256     }
 257 
 258     // show header line right away, to reassure users the app is working
 259     printf("%s\n", header);
 260 
 261     // if output is done, don't even bother doing anything
 262     if (feof(stdout)) {
 263         return 0;
 264     }
 265 
 266     // use stdin when not given any filepaths, or when all paths are empty
 267     if (argc <= 1 || empty == argc - 1) {
 268         stats res;
 269         handle_input(stdin, &res, "-");
 270         return 0;
 271     }
 272 
 273     stats res;
 274     size_t errors = 0;
 275 
 276     for (int i = 1; i < argc; i++) {
 277         // if output is done while being piped, quit right away
 278         if (feof(stdout)) {
 279             return errors;
 280         }
 281 
 282         // ignore empty names
 283         if (argv[i][0] == 0) {
 284             continue;
 285         }
 286 
 287         // handle `-` as stdin
 288         if (argv[i][0] == '-' && argv[i][1] == 0) {
 289             handle_input(stdin, &res, argv[i]);
 290             continue;
 291         }
 292 
 293         errors += !handle_file(argv[i], &res);
 294     }
 295 
 296     return errors;
 297 }
 298 
 299 int main(int argc, char** argv) {
 300 #ifdef _WIN32
 301     setmode(fileno(stdin), O_BINARY);
 302     // ensure output lines end in LF instead of CRLF on windows
 303     setmode(fileno(stdout), O_BINARY);
 304     setmode(fileno(stderr), O_BINARY);
 305 #endif
 306 
 307     return run(argc, argv) == 0 ? 0 : 1;
 308 }