File: coby.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 coby [files...]
  27 
  28 
COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - line-feeds
    - how many lines end with a CRLF pair
    - spaces
    - tabs
    - how many lines have trailing spaces
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data start with
  39 
  40 The output is TSV (tab-separated values) lines, where the first line has
  41 all the column names.
  42 
  43 When no filepaths are given, the standard input is used by default.
  44 */
  45 
  46 /*
  47 You can build this command-line app by running
  48 
  49 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./coby ./coby.c
  50 */
  51 
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  57 
  58 #ifdef _WIN32
  59 #include <fcntl.h>
  60 #include <windows.h>
  61 #endif
  62 
  63 #ifdef RED_ERRORS
  64 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  65 #ifdef __APPLE__
  66 #define ERROR_STYLE "\x1b[31m"
  67 #endif
  68 #define RESET_STYLE "\x1b[0m"
  69 #else
  70 #define ERROR_STYLE
  71 #define RESET_STYLE
  72 #endif
  73 
  74 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  75 
  76 const char* header = ""
  77     "name\tbytes\tlines\tlf\tcrlf\tspaces\ttabs"
  78     "\ttrails\tnulls\tfulls\thighs\tbom";
  79 
  80 enum {
  81     no_bom = 0,
  82     utf8_bom = 1,
  83     utf16le_bom = 2,
  84     utf16be_bom = 3,
  85     utf32le_bom = 4,
  86     utf32be_bom = 5,
  87 };
  88 
  89 const char* bom_legend[] = {
  90     "",
  91     "UTF-8",
  92     "UTF-16 LE",
  93     "UTF-16 BE",
  94     "UTF-32 LE",
  95     "UTF-32 BE",
  96 };
  97 
  98 // stats holds all byte-related counts this app deals with
  99 typedef struct stats {
 100     uint64_t bytes;  // the total byte-count
 101     uint64_t lines;  // how many plain-text lines
 102     uint64_t lf;     // how many line-feeds
 103     uint64_t crlf;   // how many carriage-return/line-feed pairs
 104     uint64_t spaces; // how many spaces
 105     uint64_t tabs;   // how many tabs
 106     uint64_t trails; // how many plain-text lines with trailing spaces
 107     uint64_t nulls;  // how many all-bits-off bytes
 108     uint64_t fulls;  // how many all-bits-on bytes
 109     uint64_t highs;  // how many bytes with the highest-order bit on
 110     uint64_t bom;    // which (if any) kind of byte-order mark data start with
 111 } stats;
 112 
 113 uint64_t check_bom(unsigned char* data, size_t len) {
 114     const unsigned char* d = data;
 115 
 116     if (len >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) {
 117         return utf8_bom;
 118     }
 119     if (len >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0) {
 120         return utf32le_bom;
 121     }
 122     if (len >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff) {
 123         return utf32be_bom;
 124     }
 125     if (len >= 2 && data[0] == 0xff && data[1] == 0xfe) {
 126         return utf16le_bom;
 127     }
 128     if (len >= 2 && data[0] == 0xfe && data[1] == 0xff) {
 129         return utf16be_bom;
 130     }
 131 
 132     return no_bom;
 133 }
 134 
 135 // count_bytes gathers all sorts of byte-related stats, besides a total count
 136 void count_bytes(FILE* r, stats* res) {
 137     unsigned char buf[32 * 1024];
 138     uint64_t tally[256];
 139 
 140     uint64_t bytes = 0;
 141     uint64_t crlf = 0;
 142     uint64_t trails = 0;
 143 
 144     unsigned char prev2 = 0;
 145     unsigned char prev1 = 0;
 146 
 147     memset(tally, 0, sizeof(tally));
 148     memset(res, 0, sizeof(stats));
 149 
 150     while (true) {
 151         const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
 152 
 153         if (len < 1) {
 154             break;
 155         }
 156 
 157         if (bytes == 0) {
 158             res->bom = check_bom(buf, len);
 159         }
 160         bytes += len;
 161 
 162         for (size_t i = 0; i < len; i++) {
 163             const unsigned char cur = buf[i];
 164             tally[cur]++;
 165 
 166             crlf += (prev1 == '\r') && (cur == '\n');
 167             trails += (cur == '\n') &&
 168                 ((prev1 == ' ') || (prev2 == ' ' && prev1 == '\r'));
 169 
 170             prev2 = prev1;
 171             prev1 = cur;
 172         }
 173     }
 174 
 175     res->bytes = bytes;
 176     res->crlf = crlf;
 177     res->trails = trails;
 178 
 179     res->lines = tally['\n'];
 180     res->lf = tally['\n'];
 181     res->spaces = tally[' '];
 182     res->tabs = tally['\t'];
 183     res->nulls = tally[0];
 184     res->fulls = tally[255];
 185 
 186     res->highs = 0;
 187     for (size_t i = 128; i < 256; i++) {
 188         res->highs += tally[i];
 189     }
 190 
 191     // count last line for non-empty inputs not ending with a line-feed byte
 192     if (res->bytes > 0 && prev1 != '\n') {
 193         res->lines++;
 194     }
 195 
 196     // count last trail for inputs not ending with a line-feed byte
 197     if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
 198         res->trails++;
 199     }
 200 }
 201 
 202 // handle_input gathers stats, and shows a TSV line out of the results
 203 void handle_input(FILE* r, stats* res, const char* name) {
 204     // show the filename right away, to reassure users something's happening
 205     printf("%s", name);
 206     fflush(stdout);
 207 
 208     count_bytes(r, res);
 209 
 210     // show results as soon as they're available
 211     printf("\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu",
 212         res->bytes, res->lines, res->lf, res->crlf, res->spaces,
 213         res->tabs,res->trails, res->nulls, res->fulls, res->highs);
 214     printf("\t%s\n", bom_legend[res->bom]);
 215     fflush(stdout);
 216 }
 217 
 218 // handle_file handles data from the filename given, and returns whether the
 219 // file was opened successfully
 220 bool handle_file(const char* path, stats* res) {
 221     FILE* f = fopen(path, "rb");
 222     if (f == NULL) {
 223         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 224         return false;
 225     }
 226 
 227     handle_input(f, res, path);
 228     fclose(f);
 229     return true;
 230 }
 231 
 232 // run returns the number of errors
 233 int run(int argc, char** argv) {
 234     size_t empty = 0;
 235     size_t dashes = 0;
 236 
 237     for (int i = 1; i < argc; i++) {
 238         if (argv[i][0] == 0) {
 239             empty++;
 240             continue;
 241         }
 242 
 243         if (argv[i][0] == '-' && argv[i][1] == 0) {
 244             dashes++;
 245         }
 246     }
 247 
 248     if (dashes > 1) {
 249         const char* msg = "can't use a dash (stdin) as input more than once";
 250         fprintf(stderr, ERROR_LINE("%s"), msg);
 251         return 1;
 252     }
 253 
 254     // show header line right away, to reassure users the app is working
 255     printf("%s\n", header);
 256 
 257     // if output is done, don't even bother doing anything
 258     if (feof(stdout)) {
 259         return 0;
 260     }
 261 
 262     // use stdin when not given any filepaths, or when all paths are empty
 263     if (argc <= 1 || empty == argc - 1) {
 264         stats res;
 265         handle_input(stdin, &res, "-");
 266         return 0;
 267     }
 268 
 269     stats res;
 270     size_t errors = 0;
 271 
 272     for (int i = 1; i < argc; i++) {
 273         // if output is done while being piped, quit right away
 274         if (feof(stdout)) {
 275             return errors;
 276         }
 277 
 278         // ignore empty names
 279         if (argv[i][0] == 0) {
 280             continue;
 281         }
 282 
 283         // handle `-` as stdin
 284         if (argv[i][0] == '-' && argv[i][1] == 0) {
 285             handle_input(stdin, &res, argv[i]);
 286             continue;
 287         }
 288 
 289         errors += !handle_file(argv[i], &res);
 290     }
 291 
 292     return errors;
 293 }
 294 
 295 int main(int argc, char** argv) {
 296 #ifdef _WIN32
 297     setmode(fileno(stdin), O_BINARY);
 298     // ensure output lines end in LF instead of CRLF on windows
 299     setmode(fileno(stdout), O_BINARY);
 300     setmode(fileno(stderr), O_BINARY);
 301 #endif
 302 
 303     return run(argc, argv) == 0 ? 0 : 1;
 304 }