/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
    coby [files...]


    COunt BYtes finds out some simple byte-related stats, counting

        - bytes
        - lines
        - how many lines have trailing spaces
        - how many lines end with a CRLF pair
        - all-off (0) bytes
        - all-on (255) bytes
        - high-bytes (128+)
        - which (if any) byte-order mark the data start with

    The output is TSV (tab-separated values) lines, where the first line has
    all the column names.

    When no filepaths are given, the standard input is used by default.
*/

/*
    You can build this command-line app by running

    cc -Wall -s -O3 -march=native -mtune=native -flto -o ./coby ./coby.c

    Defining COBY_NO_MAIN leaves out the `main` function, so the counting
    functions can be compiled into another program, such as a test harness.
*/

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// pick exactly one error style: redefining a macro with a different value
// (without an #undef) isn't valid C
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

// header is the first output line, with all the TSV column names
const char* header = ""
    "name\tbytes\tlines\tlf\tcrlf\tspaces\ttabs"
    "\ttrails\tnulls\tfulls\thighs\tbom";

// the kinds of byte-order marks detected: these values are also the indices
// into bom_legend
enum {
    no_bom = 0,
    utf8_bom = 1,
    utf16le_bom = 2,
    utf16be_bom = 3,
    utf32le_bom = 4,
    utf32be_bom = 5,
};

// bom_legend has the text shown in the `bom` column for each kind of BOM
const char* bom_legend[] = {
    "",
    "UTF-8",
    "UTF-16 LE",
    "UTF-16 BE",
    "UTF-32 LE",
    "UTF-32 BE",
};

// stats holds all byte-related counts this app deals with
typedef struct stats {
    uint64_t bytes;  // the total byte-count
    uint64_t lines;  // how many plain-text lines
    uint64_t lf;     // how many line-feeds
    uint64_t crlf;   // how many carriage-return/line-feed pairs
    uint64_t spaces; // how many spaces
    uint64_t tabs;   // how many tabs
    uint64_t trails; // how many plain-text lines with trailing spaces
    uint64_t nulls;  // how many all-bits-off bytes
    uint64_t fulls;  // how many all-bits-on bytes
    uint64_t highs;  // how many bytes with the highest-order bit on
    uint64_t bom;    // which (if any) kind of byte-order mark data start with
} stats;

// check_bom returns which kind of byte-order mark (if any) the first `len`
// bytes of `data` start with, as one of the `*_bom` enum values
uint64_t check_bom(const unsigned char* data, size_t len) {
    const unsigned char* d = data;

    if (len >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf) {
        return utf8_bom;
    }

    // the UTF-32 checks must come before the UTF-16 ones, since the UTF-32 LE
    // mark starts with the same 2 bytes as the UTF-16 LE one
    if (len >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0) {
        return utf32le_bom;
    }
    if (len >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff) {
        return utf32be_bom;
    }
    if (len >= 2 && d[0] == 0xff && d[1] == 0xfe) {
        return utf16le_bom;
    }
    if (len >= 2 && d[0] == 0xfe && d[1] == 0xff) {
        return utf16be_bom;
    }

    return no_bom;
}

// count_bytes gathers all sorts of byte-related stats, besides a total count:
// it reads `r` until fread gives no more bytes, and overwrites all of `res`
void count_bytes(FILE* r, stats* res) {
    unsigned char buf[IBUF_SIZE];
    uint64_t tally[256] = {0};

    uint64_t bytes = 0;
    uint64_t crlf = 0;
    uint64_t trails = 0;

    // the last 2 bytes seen so far: these are kept across chunks, so pairs
    // split by a chunk boundary are still detected
    unsigned char prev2 = 0;
    unsigned char prev1 = 0;

    memset(res, 0, sizeof(*res));

    while (true) {
        const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
        if (len < 1) {
            break;
        }

        // only the very start of the data can have a byte-order mark
        if (bytes == 0) {
            res->bom = check_bom(buf, len);
        }
        bytes += len;

        for (size_t i = 0; i < len; i++) {
            const unsigned char cur = buf[i];
            tally[cur]++;

            if (cur == '\n') {
                if (prev1 == ' ') {
                    trails++;
                } else if (prev1 == '\r') {
                    crlf++;
                    // a space right before a CRLF pair is also a trailing one
                    if (prev2 == ' ') {
                        trails++;
                    }
                }
            }

            prev2 = prev1;
            prev1 = cur;
        }
    }

    res->bytes = bytes;
    res->crlf = crlf;
    res->trails = trails;

    res->lines = tally['\n'];
    res->lf = tally['\n'];
    res->spaces = tally[' '];
    res->tabs = tally['\t'];
    res->nulls = tally[0];
    res->fulls = tally[255];

    res->highs = 0;
    for (size_t i = 128; i < 256; i++) {
        res->highs += tally[i];
    }

    // count last line for non-empty inputs not ending with a line-feed byte
    if (res->bytes > 0 && prev1 != '\n') {
        res->lines++;
    }

    // count last trail for inputs not ending with a line-feed byte: this is
    // the only place an unterminated last line is checked, so its trailing
    // space is counted once, and a final lone carriage-return is never
    // counted as a CRLF pair
    if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
        res->trails++;
    }
}

// handle_input gathers stats, and shows a TSV line out of the results
void handle_input(FILE* r, stats* res, const char* name) {
    // show the filename right away, to reassure users something's happening
    printf("%s", name);
    fflush(stdout);

    count_bytes(r, res);

    // show results as soon as they're available: PRIu64 is the conversion
    // matching uint64_t, whichever integer type that is on each platform
    printf(
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64,
        res->bytes, res->lines, res->lf, res->crlf, res->spaces,
        res->tabs, res->trails, res->nulls, res->fulls, res->highs
    );
    printf("\t%s\n", bom_legend[res->bom]);
    fflush(stdout);
}

// handle_file handles data from the filename given, and returns whether the
// file was opened successfully
bool handle_file(const char* path, stats* res) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_input(f, res, path);
    fclose(f);
    return true;
}

// run returns the number of errors
int run(int argc, char** argv) {
    size_t empty = 0;
    size_t dashes = 0;

    for (int i = 1; i < argc; i++) {
        if (argv[i][0] == 0) {
            empty++;
            continue;
        }

        if (argv[i][0] == '-' && argv[i][1] == 0) {
            dashes++;
        }
    }

    // stdin can only be read through once
    if (dashes > 1) {
        const char* msg = "can't use a dash (stdin) as input more than once";
        fprintf(stderr, ERROR_LINE("%s"), msg);
        return 1;
    }

    // show header line right away, to reassure users the app is working
    printf("%s\n", header);
    fflush(stdout);

    // if output is done, don't even bother doing anything: write failures
    // set the stream's error indicator, while the end-of-file indicator is
    // only ever set when reading
    if (ferror(stdout)) {
        return 0;
    }

    // use stdin when not given any filepaths, or when all paths are empty
    if (argc <= 1 || empty == (size_t)(argc - 1)) {
        stats res;
        handle_input(stdin, &res, "-");
        return 0;
    }

    stats res;
    int errors = 0;

    for (int i = 1; i < argc; i++) {
        // if output is done while being piped, quit right away
        if (ferror(stdout)) {
            return errors;
        }

        // ignore empty names
        if (argv[i][0] == 0) {
            continue;
        }

        // handle `-` as stdin
        if (argv[i][0] == '-' && argv[i][1] == 0) {
            handle_input(stdin, &res, argv[i]);
            continue;
        }

        if (!handle_file(argv[i], &res)) {
            errors++;
        }
    }

    return errors;
}

// COBY_NO_MAIN lets the functions above be compiled into another program,
// which then provides its own entry point
#ifndef COBY_NO_MAIN
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    return run(argc, argv) == 0 ? 0 : 1;
}
#endif