/*
The MIT License (MIT)

Copyright © 2024 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
coby [files...]


COunt BYtes finds out some simple byte-related stats, counting

- bytes
- runes, which are UTF-8 code-points
- lines
- line-feed bytes
- how many lines end with a CRLF pair
- spaces
- tabs
- how many lines have trailing spaces
- all-off (0) bytes
- all-on (255) bytes
- high-bytes (128+)
- which (if any) byte-order mark the data start with

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. A single
dash (-) given as a filepath also stands for the standard input, and can be
used at most once.
*/

/*
You can build this command-line app by running
cc -Wall -s -O2 -o ./coby ./coby.c
*/

#include <fcntl.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <io.h>
#include <windows.h>
#endif

// header is the first output line, naming all the TSV columns
const char* header = ""
    "name\tbytes\trunes\tlines\tlf\tcrlf\tspaces\ttabs"
    "\ttrails\tnulls\tfulls\thighs\tbom";

// the kinds of byte-order marks detected: these values are also used as
// indices into bom_legend
enum {
    no_bom = 0,
    utf8_bom = 1,
    utf16le_bom = 2,
    utf16be_bom = 3,
    utf32le_bom = 4,
    utf32be_bom = 5,
};

// bom_legend has the text shown in the `bom` column for each kind of BOM
const char* bom_legend[] = {
    "",
    "UTF-8",
    "UTF-16 LE",
    "UTF-16 BE",
    "UTF-32 LE",
    "UTF-32 BE",
};

// stats holds all byte-related counts this app deals with
typedef struct stats {
    uint64_t bytes;  // the total byte-count
    uint64_t runes;  // how many utf-8 items
    uint64_t lines;  // how many plain-text lines
    uint64_t lf;     // how many line-feeds
    uint64_t crlf;   // how many carriage-return/line-feed pairs
    uint64_t spaces; // how many spaces
    uint64_t tabs;   // how many tabs
    uint64_t trails; // how many plain-text lines with trailing spaces
    uint64_t nulls;  // how many all-bits-off bytes
    uint64_t fulls;  // how many all-bits-on bytes
    uint64_t highs;  // how many bytes with the highest-order bit on
    uint64_t bom;    // which (if any) kind of byte-order mark data start with
} stats;

// check_bom returns which kind of byte-order mark (one of the *_bom enum
// values) the first `len` bytes in `data` start with, or no_bom if none match
uint64_t check_bom(const unsigned char* data, size_t len) {
    if (len >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) {
        return utf8_bom;
    }

    // the UTF-32 checks must come before the UTF-16 ones, since the UTF-32 LE
    // mark starts with the same 2 bytes as the UTF-16 LE one
    if (len >= 4) {
        if (data[0] == 0xff && data[1] == 0xfe && data[2] == 0 && data[3] == 0) {
            return utf32le_bom;
        }
        if (data[0] == 0 && data[1] == 0 && data[2] == 0xfe && data[3] == 0xff) {
            return utf32be_bom;
        }
    }

    if (len >= 2 && data[0] == 0xff && data[1] == 0xfe) {
        return utf16le_bom;
    }
    if (len >= 2 && data[0] == 0xfe && data[1] == 0xff) {
        return utf16be_bom;
    }

    return no_bom;
}

// count_bytes gathers all sorts of byte-related stats, besides a total count:
// it reads from `r` until fread gives no more bytes (end of data or a read
// error), and overwrites all fields in `res` with the results
void count_bytes(FILE* r, stats* res) {
    unsigned char buf[48 * 1024];
    uint64_t tally[256] = {0};

    uint64_t bytes = 0;
    uint64_t crlf = 0;
    uint64_t trails = 0;
    uint64_t runes = 0;

    // the 2 bytes before the current one: these carry over across chunks
    unsigned char prev2 = 0;
    unsigned char prev1 = 0;

    memset(res, 0, sizeof(*res));

    while (true) {
        const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
        if (len < 1) {
            break;
        }

        // byte-order marks can only be at the very start of the data
        if (bytes == 0) {
            res->bom = check_bom(buf, len);
        }
        bytes += len;

        for (size_t i = 0; i < len; i++) {
            const unsigned char cur = buf[i];
            tally[cur]++;

            crlf += (prev1 == '\r') && (cur == '\n');
            // a line has a trailing space if one comes right before its
            // line-feed, or right before its carriage-return/line-feed pair
            trails += (cur == '\n') &&
                ((prev1 == ' ') || (prev2 == ' ' && prev1 == '\r'));
            // every byte which isn't a UTF-8 continuation byte starts a rune
            runes += (cur & 0xc0) != 0x80;

            prev2 = prev1;
            prev1 = cur;
        }
    }

    res->bytes = bytes;
    res->crlf = crlf;
    res->trails = trails;
    res->runes = runes;

    res->lines = tally['\n'];
    res->lf = tally['\n'];
    res->spaces = tally[' '];
    res->tabs = tally['\t'];
    res->nulls = tally[0];
    res->fulls = tally[255];

    res->highs = 0;
    for (size_t i = 128; i < 256; i++) {
        res->highs += tally[i];
    }

    // count last line for non-empty inputs not ending with a line-feed byte
    if (res->bytes > 0 && prev1 != '\n') {
        res->lines++;
    }

    // count last trail for inputs not ending with a line-feed byte
    if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
        res->trails++;
    }
}

// handle_input gathers stats, and shows a TSV line out of the results
void handle_input(FILE* r, stats* res, const char* name) {
    // show the filename right away, to reassure users something's happening
    printf("%s", name);
    fflush(stdout);

    count_bytes(r, res);

    // show results as soon as they're available: the PRIu64 macro is used
    // since `unsigned long` isn't 64 bits wide on all platforms, which makes
    // `%lu` wrong for uint64_t values on some of them (windows, for example)
    printf(
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t",
        res->bytes, res->runes, res->lines, res->lf,
        res->crlf, res->spaces, res->tabs, res->trails,
        res->nulls, res->fulls, res->highs
    );
    printf("%s\n", bom_legend[res->bom]);
    fflush(stdout);
}

// read_failed checks whether reading from the stream given ended because of
// an error, showing an error message when that's the case
static bool read_failed(FILE* r, const char* name) {
    if (!ferror(r)) {
        return false;
    }
    fprintf(stderr, "\x1b[31merror while reading %s\x1b[0m\n", name);
    return true;
}

// is_dash checks if the string given is exactly a single dash, which is how
// the standard input is named among the command-line arguments
static bool is_dash(const char* s) {
    return s[0] == '-' && s[1] == 0;
}

// handle_file handles data from the filename given, and returns whether the
// file was both opened and read successfully
bool handle_file(const char* fname, stats* res) {
    FILE* f = fopen(fname, "rb");
    if (f == NULL) {
        fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", fname);
        return false;
    }

    handle_input(f, res, fname);
    // opening can succeed for things which can't then be read: the stats
    // line is still shown in that case, but the failure counts as an error
    const bool ok = !read_failed(f, fname);
    fclose(f);
    return ok;
}

// run returns the number of errors
size_t run(int argc, char** argv) {
    size_t empty = 0;
    size_t dashes = 0;

    for (int i = 1; i < argc; i++) {
        if (argv[i][0] == 0) {
            empty++;
            continue;
        }

        if (is_dash(argv[i])) {
            dashes++;
        }
    }

    if (dashes > 1) {
        const char* msg = "can't use a dash (stdin) as input more than once";
        fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
        return 1;
    }

    // show header line right away, to reassure users the app is working
    printf("%s\n", header);
    fflush(stdout);

    // if output is done, don't even bother doing anything: failed writes set
    // the error indicator of an output stream, never its end-of-file one
    if (ferror(stdout)) {
        return 0;
    }

    // use stdin when not given any filepaths, or when all paths are empty
    if (argc <= 1 || empty == (size_t)(argc - 1)) {
        stats res;
        handle_input(stdin, &res, "-");
        return read_failed(stdin, "standard input") ? 1 : 0;
    }

    stats res;
    size_t errors = 0;

    for (int i = 1; i < argc; i++) {
        // if output is done while being piped, quit right away
        if (ferror(stdout)) {
            return errors;
        }

        // ignore empty names
        if (argv[i][0] == 0) {
            continue;
        }

        // handle `-` as stdin
        if (is_dash(argv[i])) {
            handle_input(stdin, &res, argv[i]);
            errors += read_failed(stdin, "standard input") ? 1 : 0;
            continue;
        }

        errors += handle_file(argv[i], &res) ? 0 : 1;
    }

    return errors;
}

int main(int argc, char** argv) {
#ifdef _WIN32
    // avoid newline translation on windows: input bytes must be counted
    // exactly as they are
    _setmode(_fileno(stdin), _O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    _setmode(_fileno(stdout), _O_BINARY);
    _setmode(_fileno(stderr), _O_BINARY);
#endif

    // disable automatic stdio buffering: input is read in big chunks into
    // an explicit buffer, and output is flushed explicitly
    setvbuf(stdin, NULL, _IONBF, 0);
    setvbuf(stdout, NULL, _IONBF, 0);
    setvbuf(stderr, NULL, _IONBF, 0);

    return run(argc, argv) == 0 ? 0 : 1;
}