// File: coby.go

/*
The MIT License (MIT)

Copyright © 2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
To compile a smaller-sized command-line app, you can use the `go` command as
follows:

go build -ldflags "-s -w" -trimpath coby.go
*/

package main

import (
	"bufio"
	"errors"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"sync"
)

// info is the help message; its leading line-feed is skipped when shown
const info = `
coby [options...] [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - how many lines have trailing spaces (trails)
    - how many lines end with a CRLF pair
    - all-bits-off (null) bytes
    - all-bits-on (full) bytes
    - top-bit-on (high) bytes
    - which unicode byte-order-mark (bom) sequence the data start with

Some of these stats (lines, CRLFs, BOMs) only make sense for plain-text
data, and thus may not be meaningful for general binary data.

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them. A mix
of files/folders is supported for convenience.

The only option available is to show this help message, using any of
"-h", "--h", "-help", or "--help", without the quotes.
`

// header has all the values for the first output line; func showResult
// emits its columns in this same order
var header = []string{
	`name`,
	`bytes`,
	`lines`,
	`lf`,
	`crlf`,
	`spaces`,
	`tabs`,
	`trails`,
	`nulls`,
	`fulls`,
	`highs`,
	`bom`,
}

// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully
type event struct {
	// Index points to the task's entry in the results-slice
	Index int

	// Stats has all the byte-related stats
	Stats stats

	// Err is the completed task's error, or lack of
	Err error
}

func main() {
	args := os.Args[1:]

	if len(args) > 0 {
		switch args[0] {
		case `-h`, `--h`, `-help`, `--help`:
			os.Stderr.WriteString(info[1:])
			return

		case `--`:
			args = args[1:]
		}
	}

	// show first/heading line right away, to let users know things are
	// happening
	for i, s := range header {
		if i > 0 {
			os.Stdout.WriteString("\t")
		}
		os.Stdout.WriteString(s)
	}
	// assume an error means later stages/apps in a pipe had enough input and
	// quit successfully, so quit successfully too
	_, err := os.Stdout.WriteString("\n")
	if err != nil {
		return
	}

	// names has all filepaths given, ignoring repetitions
	names, ok := findAllFiles(deduplicate(args))
	if !ok {
		os.Exit(1)
	}
	if len(names) == 0 {
		names = []string{`-`}
	}

	events := make(chan event)
	go handleInputs(names, events)
	if !handleOutput(os.Stdout, len(names), events) {
		os.Exit(1)
	}
}

// handleInputs launches all the tasks which do the actual work, limiting how
// many inputs are being worked on at the same time; it closes the `events`
// channel once all tasks are over
func handleInputs(names []string, events chan event) {
	// allow output-reporter task to end, and thus the app
	defer close(events)

	// permissions limits how many worker tasks can be active at the same
	// time: when given many filepaths to work on, rate-limiting avoids
	// a massive number of concurrent tasks which read and process input
	permissions := make(chan struct{}, runtime.NumCPU())
	defer close(permissions)

	var inputs sync.WaitGroup
	for i := range names {
		// wait until some concurrency-room is available
		permissions <- struct{}{}
		inputs.Add(1)

		go func(i int) {
			// deferred calls run last-in-first-out: the permission is
			// given back before the task is marked as done
			defer inputs.Done()
			defer func() { <-permissions }()
			res, err := handleInput(names[i])
			events <- event{i, res, err}
		}(i)
	}

	// wait for all inputs, before closing the `events` channel
	inputs.Wait()
}

// handleInput handles each work-item for func handleInputs: a dash stands
// for the standard input, while anything else is opened as a file
func handleInput(path string) (stats, error) {
	var res stats
	res.name = path

	if path == `-` {
		err := res.updateStats(os.Stdin)
		return res, err
	}

	f, err := os.Open(path)
	if err != nil {
		res.result = resultError
		// on windows, file-not-found error messages may mention `CreateFile`,
		// even when trying to open files in read-only mode
		return res, errors.New(`can't open file named ` + path)
	}
	defer f.Close()

	err = res.updateStats(f)
	return res, err
}

// handleOutput asynchronously updates output as results are known, whether
// it's errors or successful results; returns whether it succeeded, which
// means no errors happened
func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
	ok = true
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	results := make([]stats, rescount)

	// keep track of which tasks are over, so that on each event all leading
	// results which are ready are shown: all of this ensures prompt output
	// updates as soon as results come in, while keeping the original order
	// of the names/filepaths given
	resultsLeft := results

	for v := range events {
		results[v.Index] = v.Stats
		if v.Err != nil {
			ok = false
			// show all pending output before the error message
			bw.Flush()
			showError(v.Err)

			// stay in the current loop, in case this failure was keeping
			// previous successes from showing up
		}

		n := countLeadingReady(resultsLeft)

		for _, res := range resultsLeft[:n] {
			if err := showResult(bw, res); err != nil {
				// assume later stages/apps in a pipe had enough input and
				// quit successfully, so quit successfully too
				return true
			}
		}
		resultsLeft = resultsLeft[n:]

		// flush output-buffer only if anything new was shown
		if n > 0 {
			bw.Flush()
		}
	}

	return ok
}

// showError writes the error message given as its own line on the standard
// error
func showError(err error) {
	os.Stderr.WriteString(err.Error())
	os.Stderr.WriteString("\n")
}

// showResult writes a TSV line for the stats given, following the column
// order in var header; results marked as errors emit nothing
func showResult(w *bufio.Writer, s stats) error {
	if s.result == resultError {
		return nil
	}

	// columns follow the name in the same order as in var header
	columns := [...]counter{
		s.bytes,
		s.lines,
		s.lf,
		s.crlf,
		s.spaces,
		s.tabs,
		s.trailing,
		s.nulls,
		s.fulls,
		s.highs,
	}

	// errors from intermediate writes are deliberately unchecked, since a
	// bufio.Writer keeps failing after its first error: any failure which
	// has already happened is returned by the last write
	var buf [64]byte
	w.WriteString(s.name)
	for _, v := range columns {
		w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(v), 10))
	}
	w.WriteByte('\t')
	w.WriteString(bomLegend[s.bom])
	return w.WriteByte('\n')
}

// deduplicate avoids repeating items, keeping the original slice unchanged
func deduplicate(src []string) []string {
	var unique []string
	got := make(map[string]struct{})

	for _, s := range src {
		if _, ok := got[s]; ok {
			continue
		}

		unique = append(unique, s)
		got[s] = struct{}{}
	}

	return unique
}

// findAllFiles can be given a mix of file/folder paths, finding all files
// recursively in folders, avoiding duplicates; all errors are shown on the
// standard error as they happen, and the second result is false if any
// error happened
func findAllFiles(paths []string) (found []string, ok bool) {
	ok = true
	got := make(map[string]struct{})

	// keep remembers the path given, unless it's already been found
	keep := func(path string) {
		if _, dup := got[path]; dup {
			return
		}
		got[path] = struct{}{}
		found = append(found, path)
	}

	// fail shows the error given, and marks the overall result as failed
	fail := func(err error) {
		showError(err)
		ok = false
	}

	// visit is the callback for filepath.WalkDir: errors are only returned,
	// since the walk hands them back to its caller, which reports them once
	visit := func(path string, entry fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if !entry.IsDir() {
			keep(path)
		}
		return nil
	}

	for _, s := range paths {
		// a dash means standard input
		if s == `-` {
			keep(s)
			continue
		}

		st, err := os.Stat(s)
		if os.IsNotExist(err) {
			// on windows, file-not-found messages may mention `CreateFile`,
			// even when trying to open files in read-only mode
			fail(errors.New(`can't find file/folder named ` + s))
			continue
		}

		if err != nil {
			fail(err)
			continue
		}

		if !st.IsDir() {
			keep(s)
			continue
		}

		if err := filepath.WalkDir(s, visit); err != nil {
			fail(err)
		}
	}

	return found, ok
}

// counter makes it easy to change the int-size of almost all counters
type counter uint64

// statResult constrains possible result-states/values in type stats
type statResult int

const (
	// resultPending is the default not-yet-ready result-status
	resultPending = statResult(0)

	// resultError means result should show as an error, instead of data
	resultError = statResult(1)

	// resultSuccess means a result's stats are ready to show
	resultSuccess = statResult(2)
)

// bomType is the type for the byte-order-mark enumeration
type bomType int

const (
	noBOM      = bomType(0)
	utf8BOM    = bomType(1)
	utf16leBOM = bomType(2)
	utf16beBOM = bomType(3)
	utf32leBOM = bomType(4)
	utf32beBOM = bomType(5)
)

// bomLegend has the string-equivalents of the bomType constants, which are
// used as indices into it
var bomLegend = []string{
	``,
	`UTF-8`,
	`UTF-16 LE`,
	`UTF-16 BE`,
	`UTF-32 LE`,
	`UTF-32 BE`,
}

// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read
	bytes counter

	// lines counts lines, and is 0 only when the byte-count is also 0
	lines counter

	// maxWidth is maximum byte-width of lines, excluding carriage-returns
	// and/or line-feeds
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// the type of byte-order mark detected
	bom bomType

	// name is the filepath of the file/source these stats are about
	name string

	// result keeps track of whether results are valid and/or ready
	result statResult
}

// updateStats does what it says, reading everything from a reader; it also
// marks the result as either a success or an error
func (res *stats) updateStats(r io.Reader) error {
	err := res.updateUsing(r)
	if err == io.EOF {
		err = nil
	}

	if err == nil {
		res.result = resultSuccess
	} else {
		res.result = resultError
	}
	return err
}

// checkBOM finds which byte-order mark the data given start with, if any
func checkBOM(data []byte) bomType {
	d := data
	l := len(d)

	if l >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf {
		return utf8BOM
	}

	// the UTF-32 checks must come before the UTF-16 ones, since the UTF-32
	// little-endian mark starts with the UTF-16 little-endian one
	if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
		return utf32leBOM
	}
	if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
		return utf32beBOM
	}
	if l >= 2 && d[0] == 0xff && d[1] == 0xfe {
		return utf16leBOM
	}
	if l >= 2 && d[0] == 0xfe && d[1] == 0xff {
		return utf16beBOM
	}

	return noBOM
}

// maxEmptyReads limits how many consecutive reads can give no bytes and no
// error, before func updateUsing gives up on its reader
const maxEmptyReads = 100

// updateUsing helps func updateStats do its job: it reads until the reader
// fails, which includes reaching the end of the data, and returns nil when
// the data end normally
func (res *stats) updateUsing(r io.Reader) error {
	var buf [32 * 1024]byte
	var tallies [256]uint64

	var width counter
	var prev1, prev2 byte
	empty := 0

	for {
		n, err := r.Read(buf[:])

		if n > 0 {
			empty = 0
			chunk := buf[:n]

			// only the first non-empty chunk is checked for a BOM
			if res.bytes == 0 {
				res.bom = checkBOM(chunk)
			}
			res.bytes += counter(n)

			for _, b := range chunk {
				// count values without branching, because it's fun
				tallies[b]++

				if b != '\n' {
					prev2 = prev1
					prev1 = b
					width++
					continue
				}

				// handle line-feeds

				crlf := count(prev1, '\r')
				res.crlf += crlf

				// count lines with trailing spaces, whether these end with
				// a CRLF byte-pair or just a line-feed byte
				if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
					res.trailing++
				}

				// exclude any CR from the current line's width-count
				width -= crlf
				if res.maxWidth < width {
					res.maxWidth = width
				}

				prev2 = prev1
				prev1 = b
				width = 0
			}
		}

		if err == nil {
			if n > 0 {
				continue
			}

			// reading nothing without an error doesn't mean the data are
			// over, so keep trying, up to a point
			empty++
			if empty < maxEmptyReads {
				continue
			}
			err = io.ErrNoProgress
		}

		// reading is over, either normally or not: turn tallies into stats
		res.lines = counter(tallies['\n'])
		res.tabs = counter(tallies['\t'])
		res.spaces = counter(tallies[' '])
		res.lf = counter(tallies['\n'])
		res.nulls = counter(tallies[0])
		res.fulls = counter(tallies[255])
		for i := 128; i < 256; i++ {
			res.highs += counter(tallies[i])
		}

		if err == io.EOF {
			return res.handleEnd(width, prev1, prev2)
		}
		return err
	}
}

// handleEnd fixes/finalizes stats when input data end; this func is only
// meant to be used by func updateUsing, since it takes some of the latter's
// local variables
func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
	if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
		res.trailing++
	}

	if res.maxWidth < width {
		res.maxWidth = width
	}

	// avoid reporting 0 lines with a non-0 byte-count: this is unlike the
	// standard cmd-line tool `wc`
	if res.bytes > 0 && prev1 != '\n' {
		res.lines++
	}

	return nil
}

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly to totals
func count(x, y byte) counter {
	if x == y {
		return 1
	}
	return 0
}

// countLeadingReady finds how many items are ready to show at the start of a
// results-slice, which ensures output matches the original item-order
func countLeadingReady(values []stats) int {
	for i, v := range values {
		if v.result == resultPending {
			return i
		}
	}
	return len(values)
}