// File: coby.c

/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
coby [files...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - line-feeds, spaces, and tabs
    - how many lines have trailing spaces
    - how many lines end with a CRLF pair
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data start with

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./coby ./coby.c
*/

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// pick exactly one error style per platform, so the macro is never redefined
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

// ERROR_LINE turns a string literal into a (possibly styled) format string
// which ends with a line-feed
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

// header is the first output line, and has all the TSV column names
const char* header = ""
    "name\tbytes\tlines\tlf\tcrlf\tspaces\ttabs"
    "\ttrails\tnulls\tfulls\thighs\tbom";

// these values double as indices into array bom_legend
enum {
    no_bom = 0,
    utf8_bom = 1,
    utf16le_bom = 2,
    utf16be_bom = 3,
    utf32le_bom = 4,
    utf32be_bom = 5,
};

// bom_legend has the text shown in the `bom` column for each kind of BOM
const char* bom_legend[] = {
    "",
    "UTF-8",
    "UTF-16 LE",
    "UTF-16 BE",
    "UTF-32 LE",
    "UTF-32 BE",
};

// stats holds all byte-related counts this app deals with
typedef struct stats {
    uint64_t bytes;  // the total byte-count
    uint64_t lines;  // how many plain-text lines
    uint64_t lf;     // how many line-feeds
    uint64_t crlf;   // how many carriage-return/line-feed pairs
    uint64_t spaces; // how many spaces
    uint64_t tabs;   // how many tabs
    uint64_t trails; // how many plain-text lines with trailing spaces
    uint64_t nulls;  // how many all-bits-off bytes
    uint64_t fulls;  // how many all-bits-on bytes
    uint64_t highs;  // how many bytes with the highest-order bit on
    uint64_t bom;    // which (if any) kind of byte-order mark data start with
} stats;

// check_bom finds which (if any) byte-order mark the first `len` bytes of
// `data` start with: the result is one of the `*_bom` enum values, which
// means it's also a valid index for bom_legend
uint64_t check_bom(const unsigned char* data, size_t len) {
    const unsigned char* d = data;

    if (len >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf) {
        return utf8_bom;
    }

    // the UTF-32 LE mark starts with the UTF-16 LE one, so both 4-byte
    // marks are checked before the 2-byte ones
    if (len >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0) {
        return utf32le_bom;
    }
    if (len >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff) {
        return utf32be_bom;
    }

    if (len >= 2 && d[0] == 0xff && d[1] == 0xfe) {
        return utf16le_bom;
    }
    if (len >= 2 && d[0] == 0xfe && d[1] == 0xff) {
        return utf16be_bom;
    }

    return no_bom;
}

// count_bytes gathers all sorts of byte-related stats, besides a total count:
// it reads from `r` until fread gives no more bytes, and overwrites all
// fields in `res` with the results
void count_bytes(FILE* r, stats* res) {
    unsigned char buf[32 * 1024];
    uint64_t tally[256];

    uint64_t bytes = 0;
    uint64_t crlf = 0;
    uint64_t trails = 0;

    // the last 2 bytes seen so far: these are kept across chunks, so pairs
    // split between 2 reads are still detected
    unsigned char prev2 = 0;
    unsigned char prev1 = 0;

    memset(tally, 0, sizeof(tally));
    memset(res, 0, sizeof(stats));

    while (true) {
        const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
        if (len < 1) {
            break;
        }

        // only the very first chunk can have a byte-order mark
        if (bytes == 0) {
            res->bom = check_bom(buf, len);
        }
        bytes += len;

        for (size_t i = 0; i < len; i++) {
            const unsigned char cur = buf[i];
            tally[cur]++;

            crlf += (prev1 == '\r') && (cur == '\n');
            // a line has trailing spaces when a space comes right before
            // its line-feed, or right before its CRLF pair
            trails += (cur == '\n') &&
                ((prev1 == ' ') || (prev2 == ' ' && prev1 == '\r'));

            prev2 = prev1;
            prev1 = cur;
        }
    }

    res->bytes = bytes;
    res->crlf = crlf;
    res->trails = trails;

    res->lines = tally['\n'];
    res->lf = tally['\n'];
    res->spaces = tally[' '];
    res->tabs = tally['\t'];
    res->nulls = tally[0];
    res->fulls = tally[255];

    res->highs = 0;
    for (size_t i = 128; i < 256; i++) {
        res->highs += tally[i];
    }

    // count last line for non-empty inputs not ending with a line-feed byte
    if (res->bytes > 0 && prev1 != '\n') {
        res->lines++;
    }

    // count last trail for inputs not ending with a line-feed byte
    if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
        res->trails++;
    }
}

// handle_input gathers stats, and shows a TSV line out of the results; the
// stats are also left in `res`, while `name` is what the first column shows
void handle_input(FILE* r, stats* res, const char* name) {
    // show the filename right away, to reassure users something's happening
    printf("%s", name);
    fflush(stdout);

    count_bytes(r, res);

    // show results as soon as they're available: the PRIu64 macro ensures
    // the format matches type uint64_t on all platforms, unlike `%lu`
    printf(
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64,
        res->bytes, res->lines, res->lf, res->crlf, res->spaces,
        res->tabs, res->trails, res->nulls, res->fulls, res->highs
    );
    printf("\t%s\n", bom_legend[res->bom]);
    fflush(stdout);
}

// handle_file handles data from the filename given, and returns whether the
// file was opened successfully
bool handle_file(const char* path, stats* res) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_input(f, res, path);
    fclose(f);
    return true;
}

// run returns the number of errors
int run(int argc, char** argv) {
    int empty = 0;
    int dashes = 0;

    for (int i = 1; i < argc; i++) {
        if (argv[i][0] == 0) {
            empty++;
            continue;
        }

        if (argv[i][0] == '-' && argv[i][1] == 0) {
            dashes++;
        }
    }

    if (dashes > 1) {
        const char* msg = "can't use a dash (stdin) as input more than once";
        fprintf(stderr, ERROR_LINE("%s"), msg);
        return 1;
    }

    // show header line right away, to reassure users the app is working
    printf("%s\n", header);

    // if output is done, don't even bother doing anything: failed writes set
    // the stream's error indicator, while the end-of-file indicator checked
    // by feof is only set by input operations
    if (ferror(stdout)) {
        return 0;
    }

    stats res;

    // use stdin when not given any filepaths, or when all paths are empty
    if (argc <= 1 || empty == argc - 1) {
        handle_input(stdin, &res, "-");
        return 0;
    }

    int errors = 0;

    for (int i = 1; i < argc; i++) {
        // if output is done while being piped, quit right away
        if (ferror(stdout)) {
            return errors;
        }

        // ignore empty names
        if (argv[i][0] == 0) {
            continue;
        }

        // handle `-` as stdin
        if (argv[i][0] == '-' && argv[i][1] == 0) {
            handle_input(stdin, &res, argv[i]);
            continue;
        }

        if (!handle_file(argv[i], &res)) {
            errors++;
        }
    }

    return errors;
}

// main exits with code 0 only when no errors happened
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    return run(argc, argv) == 0 ? 0 : 1;
}