// File: coby.c
/*
The MIT License (MIT)

Copyright © 2024 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
    coby [files...]


    COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - runes, which are UTF-8 code-points
    - lines
    - line-feed bytes
    - how many lines end with a CRLF pair
    - spaces
    - tabs
    - how many lines have trailing spaces
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data starts with

    The output is TSV (tab-separated values) lines, where the first line has
    all the column names.

    When no filepaths are given, the standard input is used by default; a
    single dash (`-`) among the filepaths also stands for the standard input.
*/

/*
    You can build this command-line app by running

    cc -Wall -s -O2 -o ./coby ./coby.c
*/

#include <fcntl.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <io.h>
#include <windows.h>
#endif

// header is the first output line, naming all TSV columns in order
const char* header = ""
    "name\tbytes\trunes\tlines\tlf\tcrlf\tspaces\ttabs"
    "\ttrails\tnulls\tfulls\thighs\tbom";

// kinds of byte-order marks: these values index into bom_legend
enum {
    no_bom = 0,
    utf8_bom = 1,
    utf16le_bom = 2,
    utf16be_bom = 3,
    utf32le_bom = 4,
    utf32be_bom = 5,
};

// bom_legend has the text shown in the `bom` column for each kind of BOM
const char* bom_legend[] = {
    "",
    "UTF-8",
    "UTF-16 LE",
    "UTF-16 BE",
    "UTF-32 LE",
    "UTF-32 BE",
};

// stats holds all byte-related counts this app deals with
typedef struct stats {
    uint64_t bytes;  // the total byte-count
    uint64_t runes;  // how many utf-8 items
    uint64_t lines;  // how many plain-text lines
    uint64_t lf;     // how many line-feeds
    uint64_t crlf;   // how many carriage-return/line-feed pairs
    uint64_t spaces; // how many spaces
    uint64_t tabs;   // how many tabs
    uint64_t trails; // how many plain-text lines with trailing spaces
    uint64_t nulls;  // how many all-bits-off bytes
    uint64_t fulls;  // how many all-bits-on bytes
    uint64_t highs;  // how many bytes with the highest-order bit on
    uint64_t bom;    // which (if any) kind of byte-order mark data start with
} stats;

// check_bom returns which kind of byte-order mark (one of the *_bom enum
// values) the first `len` bytes of `data` start with, or no_bom if none does
uint64_t check_bom(const unsigned char* data, size_t len) {
    const unsigned char* d = data;

    if (len >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf) {
        return utf8_bom;
    }

    // the UTF-32 checks must come before the UTF-16 ones, since the UTF-32 LE
    // mark starts with the same 2 bytes as the UTF-16 LE one
    if (len >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0) {
        return utf32le_bom;
    }
    if (len >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff) {
        return utf32be_bom;
    }

    if (len >= 2 && d[0] == 0xff && d[1] == 0xfe) {
        return utf16le_bom;
    }
    if (len >= 2 && d[0] == 0xfe && d[1] == 0xff) {
        return utf16be_bom;
    }

    return no_bom;
}

// count_bytes gathers all sorts of byte-related stats, besides a total count:
// it reads `r` until fread gives no more bytes, and overwrites all fields of
// `res`; read errors aren't reported here, so callers wanting to tell them
// apart from a normal end of input can check ferror(r) afterwards
void count_bytes(FILE* r, stats* res) {
    unsigned char buf[32 * 1024];
    uint64_t tally[256] = {0}; // how many times each byte value shows up

    uint64_t bytes = 0;
    uint64_t crlf = 0;
    uint64_t trails = 0;
    uint64_t runes = 0;

    // the 2 latest bytes seen: these are kept across chunks, so pairs/triples
    // split by chunk boundaries are still detected
    unsigned char prev2 = 0;
    unsigned char prev1 = 0;

    memset(res, 0, sizeof(*res));

    while (true) {
        const size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
        if (len < 1) {
            break;
        }

        // only the very first chunk can have a byte-order mark
        if (bytes == 0) {
            res->bom = check_bom(buf, len);
        }
        bytes += len;

        for (size_t i = 0; i < len; i++) {
            const unsigned char cur = buf[i];
            tally[cur]++;

            crlf += (prev1 == '\r') && (cur == '\n');
            // a line has trailing spaces when a space comes right before its
            // line-feed, or right before its CRLF pair
            trails += (cur == '\n') &&
                ((prev1 == ' ') || (prev2 == ' ' && prev1 == '\r'));
            // every byte which isn't a UTF-8 continuation byte starts a rune
            runes += (cur & 0xc0) != 0x80;

            prev2 = prev1;
            prev1 = cur;
        }
    }

    res->bytes = bytes;
    res->crlf = crlf;
    res->trails = trails;
    res->runes = runes;

    res->lines = tally['\n'];
    res->lf = tally['\n'];
    res->spaces = tally[' '];
    res->tabs = tally['\t'];
    res->nulls = tally[0];
    res->fulls = tally[255];

    res->highs = 0;
    for (size_t i = 128; i < 256; i++) {
        res->highs += tally[i];
    }

    // count last line for non-empty inputs not ending with a line-feed byte
    if (res->bytes > 0 && prev1 != '\n') {
        res->lines++;
    }

    // count last trail for inputs not ending with a line-feed byte
    if (prev1 == ' ' || (prev2 == ' ' && prev1 == '\r')) {
        res->trails++;
    }
}

// handle_input gathers stats, and shows a TSV line out of the results; it
// returns whether the input was read without errors: on a read error, the
// line shown has the stats for the bytes read until then, and a message is
// also shown on stderr
bool handle_input(FILE* r, stats* res, const char* name) {
    // show the filename right away, to reassure users something's happening
    printf("%s", name);
    fflush(stdout);

    count_bytes(r, res);
    const bool ok = !ferror(r);

    // show results as soon as they're available: the PRIu64 macros ensure
    // the format always matches uint64_t, even where `unsigned long` is
    // only 32 bits wide, as it is on windows
    printf(
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
        "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t",
        res->bytes, res->runes, res->lines, res->lf, res->crlf, res->spaces,
        res->tabs, res->trails, res->nulls, res->fulls, res->highs
    );
    printf("%s\n", bom_legend[res->bom]);
    fflush(stdout);

    if (!ok) {
        fprintf(stderr, "\x1b[31merror while reading %s\x1b[0m\n", name);
    }
    return ok;
}

// handle_file handles data from the filename given, and returns whether the
// file was opened and read successfully
bool handle_file(const char* fname, stats* res) {
    FILE* f = fopen(fname, "rb");
    if (f == NULL) {
        fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", fname);
        return false;
    }

    const bool ok = handle_input(f, res, fname);
    fclose(f);
    return ok;
}

// is_dash checks if the name given is a single dash, which stands for stdin
static bool is_dash(const char* name) {
    return name[0] == '-' && name[1] == 0;
}

// run returns the number of errors
int run(int argc, char** argv) {
    int empty = 0;
    int dashes = 0;

    for (int i = 1; i < argc; i++) {
        if (argv[i][0] == 0) {
            empty++;
            continue;
        }

        if (is_dash(argv[i])) {
            dashes++;
        }
    }

    if (dashes > 1) {
        const char* msg = "can't use a dash (stdin) as input more than once";
        fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
        return 1;
    }

    // show header line right away, to reassure users the app is working
    printf("%s\n", header);
    fflush(stdout);

    // if output is done, don't even bother doing anything: a failed write
    // sets the error indicator on stdout, while its end-of-file indicator
    // is never set, since that one only concerns input operations
    if (ferror(stdout)) {
        return 0;
    }

    // use stdin when not given any filepaths, or when all paths are empty
    if (argc <= 1 || empty == argc - 1) {
        stats res;
        return handle_input(stdin, &res, "-") ? 0 : 1;
    }

    stats res;
    int errors = 0;

    for (int i = 1; i < argc; i++) {
        // if output is done while being piped, quit right away
        if (ferror(stdout)) {
            return errors;
        }

        // ignore empty names
        if (argv[i][0] == 0) {
            continue;
        }

        // handle `-` as stdin
        if (is_dash(argv[i])) {
            errors += !handle_input(stdin, &res, argv[i]);
            continue;
        }

        errors += !handle_file(argv[i], &res);
    }

    return errors;
}

int main(int argc, char** argv) {
#ifdef _WIN32
    // avoid newline translation on input, so bytes are counted as they are
    _setmode(_fileno(stdin), _O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    _setmode(_fileno(stdout), _O_BINARY);
    _setmode(_fileno(stderr), _O_BINARY);
#endif

    return run(argc, argv) == 0 ? 0 : 1;
}