// File: coby.c

/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
coby [files...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - runes, which are UTF-8 code-points
    - lines
    - how many lines have trailing spaces
    - how many lines end with a CRLF pair
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data start with

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default.
*/

/*
You can build this command-line app by running

cc -Wall -s -O2 -o ./coby ./coby.c
*/

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// const char* header = ""
//     "name\tbytes\trunes\tlines\tlf\tcrlf\tspaces\ttabs"
//     "\ttrails\tnulls\tfulls\thighs\tbom";

// header is the first output line, naming all TSV columns; the `runes`
// column is currently left out, since rune-counting is disabled below
const char* header = ""
    "name\tbytes\tlines\tlf\tcrlf\tspaces\ttabs"
    "\ttrails\tnulls\tfulls\thighs\tbom";

// the kinds of byte-order marks detected: these values are also the indices
// into the bom_legend table
enum {
    no_bom = 0,
    utf8_bom = 1,
    utf16le_bom = 2,
    utf16be_bom = 3,
    utf32le_bom = 4,
    utf32be_bom = 5,
};

// bom_legend has the text shown in the `bom` column for each BOM kind
const char* bom_legend[] = {
    "",
    "UTF-8",
    "UTF-16 LE",
    "UTF-16 BE",
    "UTF-32 LE",
    "UTF-32 BE",
};

// stats holds all byte-related counts this app deals with
typedef struct stats {
    uint64_t bytes;  // the total byte-count
    uint64_t runes;  // how many utf-8 items
    uint64_t lines;  // how many plain-text lines
    uint64_t lf;     // how many line-feeds
    uint64_t crlf;   // how many carriage-return/line-feed pairs
    uint64_t spaces; // how many spaces
    uint64_t tabs;   // how many tabs
    uint64_t trails; // how many plain-text lines with trailing spaces
    uint64_t nulls;  // how many all-bits-off bytes
    uint64_t fulls;  // how many all-bits-on bytes
    uint64_t highs;  // how many bytes with the highest-order bit on
    uint64_t bom;    // which (if any) kind of byte-order mark data start with
} stats;

// check_bom finds which kind of byte-order mark (if any) the first `len`
// bytes of `data` start with, returning one of the *_bom enum values; the
// data are only read, never changed
uint64_t check_bom(const unsigned char* data, size_t len) {
    if (len >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) {
        return utf8_bom;
    }

    // the UTF-32 checks must come before the UTF-16 ones, since the
    // UTF-32 LE mark starts with the same 2 bytes as the UTF-16 LE one
    if (len >= 4 && data[0] == 0xff && data[1] == 0xfe && data[2] == 0 &&
        data[3] == 0) {
        return utf32le_bom;
    }
    if (len >= 4 && data[0] == 0 && data[1] == 0 && data[2] == 0xfe &&
        data[3] == 0xff) {
        return utf32be_bom;
    }

    if (len >= 2 && data[0] == 0xff && data[1] == 0xfe) {
        return utf16le_bom;
    }
    if (len >= 2 && data[0] == 0xfe && data[1] == 0xff) {
        return utf16be_bom;
    }

    return no_bom;
}

// count_bytes gathers all sorts of byte-related stats, besides a total count;
// it reads `r` until fread gives no more data, and overwrites all fields of
// `res`: a read error ends the loop the same way the end of the data does,
// so callers wanting to tell these apart can check ferror(r) afterwards
void count_bytes(FILE* r, stats* res) {
    unsigned char buf[32 * 1024];
    // tally counts how many times each byte value showed up
    uint64_t tally[256];

    uint64_t bytes = 0;
    uint64_t crlf = 0;
    uint64_t trails = 0;
    // runes stays 0, since rune-counting is disabled in the loop below
    uint64_t runes = 0;

    // the 2 latest bytes before the current one, which persist across chunks
    unsigned char prev2 = 0;
    unsigned char prev1 = 0;

    memset(tally, 0, sizeof(tally));
    memset(res, 0, sizeof(stats));

    while (true) {
        const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);

        if (len < 1) {
            break;
        }

        // only the very start of the data can have a byte-order mark
        if (bytes == 0) {
            res->bom = check_bom(buf, len);
        }
        bytes += len;

        for (size_t i = 0; i < len; i++) {
            const unsigned char cur = buf[i];
            tally[cur]++;

            crlf += (prev1 == '\r') && (cur == '\n');
            // a line has a trailing space when its line-feed comes right
            // after a space, or right after a space followed by a CR
            trails += (cur == '\n') &&
                ((prev1 == ' ') || (prev2 == ' ' && prev1 == '\r'));
            // runes += (cur & 0xc0) != 0x80;

            prev2 = prev1;
            prev1 = cur;
        }
    }

    res->bytes = bytes;
    res->crlf = crlf;
    res->trails = trails;
    res->runes = runes;

    res->lines = tally['\n'];
    res->lf = tally['\n'];
    res->spaces = tally[' '];
    res->tabs = tally['\t'];
    res->nulls = tally[0];
    res->fulls = tally[255];

    res->highs = 0;
    for (size_t i = 128; i < 256; i++) {
        res->highs += tally[i];
    }

    // count last line for non-empty inputs not ending with a line-feed byte
    if (res->bytes > 0 && prev1 != '\n') {
        res->lines++;
    }

    // count last trail for inputs not ending with a line-feed byte
    if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
        res->trails++;
    }
}

// handle_input gathers stats, and shows a TSV line out of the results
void handle_input(FILE* r, stats* res, const char* name) {
    // show the filename right away, to reassure users something's happening
    printf("%s", name);
    fflush(stdout);

    count_bytes(r, res);

    // show results as soon as they're available
    // printf("\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t",
    //     res->bytes, res->runes, res->lines, res->lf, res->crlf, res->spaces,
    //     res->tabs,res->trails, res->nulls, res->fulls, res->highs);
    // printf("%s\n", bom_legend[res->bom]);

    // show results as soon as they're available: the PRIu64 macro gives the
    // right format for uint64_t values, which aren't `unsigned long` on all
    // platforms
    printf(
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t",
        res->bytes, res->lines, res->lf, res->crlf, res->spaces,
        res->tabs, res->trails, res->nulls, res->fulls, res->highs
    );
    printf("%s\n", bom_legend[res->bom]);
}

// read_failed checks if reading from the input given ended due to an error,
// instead of reaching the end of the data, and shows an error message if so
static bool read_failed(FILE* r, const char* name) {
    if (ferror(r) == 0) {
        return false;
    }

    fprintf(stderr, "\x1b[31mcan't read all data from %s\x1b[0m\n", name);
    return true;
}

// handle_file handles data from the filename given, and returns whether the
// file was opened and read successfully
bool handle_file(const char* fname, stats* res) {
    FILE* f = fopen(fname, "rb");
    if (f == NULL) {
        fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", fname);
        return false;
    }

    handle_input(f, res, fname);
    // the error check must happen before closing the file
    const bool failed = read_failed(f, fname);
    fclose(f);
    return !failed;
}

// run returns the number of errors
int run(int argc, char** argv) {
    size_t empty = 0;
    size_t dashes = 0;

    for (int i = 1; i < argc; i++) {
        if (argv[i][0] == 0) {
            empty++;
            continue;
        }

        if (argv[i][0] == '-' && argv[i][1] == 0) {
            dashes++;
        }
    }

    // stdin can only be read once
    if (dashes > 1) {
        const char* msg = "can't use a dash (stdin) as input more than once";
        fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
        return 1;
    }

    // show header line right away, to reassure users the app is working
    printf("%s\n", header);
    fflush(stdout);

    // if output is done, don't even bother doing anything: failed writes
    // set the error indicator, while the end-of-file indicator is only ever
    // set when reading
    if (ferror(stdout)) {
        return 0;
    }

    // use stdin when not given any filepaths, or when all paths are empty
    if (argc <= 1 || empty == (size_t)(argc - 1)) {
        stats res;
        handle_input(stdin, &res, "-");
        return read_failed(stdin, "-") ? 1 : 0;
    }

    stats res;
    size_t errors = 0;

    for (int i = 1; i < argc; i++) {
        // if output is done while being piped, quit right away
        if (ferror(stdout)) {
            return (int)errors;
        }

        // ignore empty names
        if (argv[i][0] == 0) {
            continue;
        }

        // handle `-` as stdin
        if (argv[i][0] == '-' && argv[i][1] == 0) {
            handle_input(stdin, &res, argv[i]);
            errors += read_failed(stdin, argv[i]);
            continue;
        }

        errors += !handle_file(argv[i], &res);
    }

    return (int)errors;
}

// main makes the standard streams binary on windows, then runs the app: the
// exit code is 0 when there were no errors, and 1 otherwise
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    return run(argc, argv) == 0 ? 0 : 1;
}