/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
coby [files...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - line-feeds, spaces, and tabs
    - how many lines have trailing spaces
    - how many lines end with a CRLF pair
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data start with

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./coby ./coby.c

Optional compile-time switches:

    -DRED_ERRORS        style error messages in red, using ANSI codes
    -DIBUF_SIZE=N       size in bytes of the input buffer (default 32 KiB)
    -DCOBY_NO_MAIN      leave out main, so this file can be included as-is
                        by a test harness
*/

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#include <windows.h>
#endif

// pick exactly one definition of ERROR_STYLE: the basic red escape code on
// apple systems, the 24-bit color one everywhere else
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

// ERROR_LINE turns a string literal into a (possibly styled) full line
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

// header is the first output line, and has all the TSV column names
const char* header = ""
    "name\tbytes\tlines\tlf\tcrlf\tspaces\ttabs"
    "\ttrails\tnulls\tfulls\thighs\tbom";

// the kinds of byte-order marks detected: these values are also valid
// indices for array bom_legend
enum {
    no_bom = 0,
    utf8_bom = 1,
    utf16le_bom = 2,
    utf16be_bom = 3,
    utf32le_bom = 4,
    utf32be_bom = 5,
};

// bom_legend has the text shown in the `bom` column for each kind of BOM
const char* bom_legend[] = {
    "",
    "UTF-8",
    "UTF-16 LE",
    "UTF-16 BE",
    "UTF-32 LE",
    "UTF-32 BE",
};

// stats holds all byte-related counts this app deals with
typedef struct stats {
    uint64_t bytes;  // the total byte-count
    uint64_t lines;  // how many plain-text lines
    uint64_t lf;     // how many line-feeds
    uint64_t crlf;   // how many carriage-return/line-feed pairs
    uint64_t spaces; // how many spaces
    uint64_t tabs;   // how many tabs
    uint64_t trails; // how many plain-text lines with trailing spaces
    uint64_t nulls;  // how many all-bits-off bytes
    uint64_t fulls;  // how many all-bits-on bytes
    uint64_t highs;  // how many bytes with the highest-order bit on
    uint64_t bom;    // which (if any) kind of byte-order mark data start with
} stats;

// check_bom returns which kind of byte-order mark (one of the *_bom enum
// values) the first `len` bytes of `data` start with, or no_bom when none
// matches; the buffer is only read, never changed
uint64_t check_bom(const unsigned char* data, size_t len) {
    const unsigned char* d = data;

    if (len >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf) {
        return utf8_bom;
    }

    // the UTF-32 checks must come before the UTF-16 ones, since the
    // UTF-32 LE mark starts with the same 2 bytes as the UTF-16 LE mark
    if (len >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0) {
        return utf32le_bom;
    }
    if (len >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff) {
        return utf32be_bom;
    }

    if (len >= 2 && d[0] == 0xff && d[1] == 0xfe) {
        return utf16le_bom;
    }
    if (len >= 2 && d[0] == 0xfe && d[1] == 0xff) {
        return utf16be_bom;
    }

    return no_bom;
}

// ends_with_space checks if the last 2 bytes seen so far end a line's content
// with a space, either directly, or right before a carriage-return
static bool ends_with_space(unsigned char prev2, unsigned char prev1) {
    return (prev1 == ' ') || (prev2 == ' ' && prev1 == '\r');
}

// count_bytes gathers all sorts of byte-related stats, besides a total count;
// it reads from `r` until fread gives no more bytes, and overwrites all
// fields in `res`
void count_bytes(FILE* r, stats* res) {
    unsigned char buf[IBUF_SIZE];
    uint64_t tally[256] = {0};

    uint64_t bytes = 0;
    uint64_t crlf = 0;
    uint64_t trails = 0;

    // the last 2 bytes seen so far: these persist across buffer refills, so
    // pairs/triples split between chunks are still detected
    unsigned char prev2 = 0;
    unsigned char prev1 = 0;

    memset(res, 0, sizeof(*res));

    while (true) {
        const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);

        // an empty read ends the loop: read errors aren't told apart from
        // the end of the data here
        if (len < 1) {
            break;
        }

        // only the very first chunk can start with a byte-order mark
        if (bytes == 0) {
            res->bom = check_bom(buf, len);
        }
        bytes += len;

        for (size_t i = 0; i < len; i++) {
            const unsigned char cur = buf[i];
            tally[cur]++;

            if (cur == '\n') {
                crlf += (prev1 == '\r');
                trails += ends_with_space(prev2, prev1);
            }

            prev2 = prev1;
            prev1 = cur;
        }
    }

    res->bytes = bytes;
    res->crlf = crlf;
    res->trails = trails;

    res->lines = tally['\n'];
    res->lf = tally['\n'];
    res->spaces = tally[' '];
    res->tabs = tally['\t'];
    res->nulls = tally[0];
    res->fulls = tally[255];

    res->highs = 0;
    for (size_t i = 128; i < 256; i++) {
        res->highs += tally[i];
    }

    // count last line for non-empty inputs not ending with a line-feed byte
    if (res->bytes > 0 && prev1 != '\n') {
        res->lines++;
    }

    // count last trail for inputs not ending with a line-feed byte: the
    // check can't succeed when the last byte is a line-feed
    if (ends_with_space(prev2, prev1)) {
        res->trails++;
    }
}

// handle_input gathers stats, and shows a TSV line out of the results
void handle_input(FILE* r, stats* res, const char* name) {
    // show the filename right away, to reassure users something's happening
    printf("%s", name);
    fflush(stdout);

    count_bytes(r, res);

    // show results as soon as they're available; the PRIu64 macros keep the
    // format verbs matching the uint64_t fields on all platforms
    printf(
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64,
        res->bytes, res->lines, res->lf, res->crlf, res->spaces,
        res->tabs, res->trails, res->nulls, res->fulls, res->highs
    );
    printf("\t%s\n", bom_legend[res->bom]);
    fflush(stdout);
}

// handle_file handles data from the filename given, and returns whether the
// file was opened successfully
bool handle_file(const char* path, stats* res) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_input(f, res, path);
    fclose(f);
    return true;
}

// run handles all the inputs named in the command-line arguments given, and
// returns the number of errors
int run(int argc, char** argv) {
    int empty = 0;
    int dashes = 0;

    for (int i = 1; i < argc; i++) {
        if (argv[i][0] == 0) {
            empty++;
            continue;
        }

        if (argv[i][0] == '-' && argv[i][1] == 0) {
            dashes++;
        }
    }

    if (dashes > 1) {
        const char* msg = "can't use a dash (stdin) as input more than once";
        fprintf(stderr, ERROR_LINE("%s"), msg);
        return 1;
    }

    // show header line right away, to reassure users the app is working
    printf("%s\n", header);

    // if output already failed, don't even bother doing anything: the
    // error indicator is checked, since the end-of-file one is only ever
    // set by input operations
    if (ferror(stdout)) {
        return 0;
    }

    // use stdin when not given any filepaths, or when all paths are empty
    if (argc <= 1 || empty == argc - 1) {
        stats res;
        handle_input(stdin, &res, "-");
        return 0;
    }

    stats res;
    int errors = 0;

    for (int i = 1; i < argc; i++) {
        // if output failed (for example while being piped), quit right away
        if (ferror(stdout)) {
            return errors;
        }

        // ignore empty names
        if (argv[i][0] == 0) {
            continue;
        }

        // handle `-` as stdin
        if (argv[i][0] == '-' && argv[i][1] == 0) {
            handle_input(stdin, &res, argv[i]);
            continue;
        }

        if (!handle_file(argv[i], &res)) {
            errors++;
        }
    }

    return errors;
}

#ifndef COBY_NO_MAIN
int main(int argc, char** argv) {
#ifdef _WIN32
    _setmode(_fileno(stdin), _O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    _setmode(_fileno(stdout), _O_BINARY);
    _setmode(_fileno(stderr), _O_BINARY);
#endif

    return run(argc, argv) == 0 ? 0 : 1;
}
#endif