/*
The MIT License (MIT)

Copyright (c) 2026 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
To compile a smaller-sized command-line app, you can use the `go` command as
follows:

	go build -ldflags "-s -w" -trimpath coby.go
*/

package main

import (
	"bufio"
	"errors"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"sync"
)

// info is the help message; its leading line-feed is skipped when shown
const info = `
coby [options...] [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - how many lines have trailing spaces (trails)
    - how many lines end with a CRLF pair
    - all-bits-off (null) bytes
    - all-bits-on (full) bytes
    - top-bit-on (high) bytes
    - which unicode byte-order-mark (bom) sequence the data start with

Some of these stats (lines, CRLFs, BOMs) only make sense for plain-text
data, and thus may not be meaningful for general binary data.

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them. A mix
of files/folders is supported for convenience.

The only option available is to show this help message, using any of
"-h", "--h", "-help", or "--help", without the quotes.
`

// maxEmptyReads limits how many consecutive reads giving no data and no
// error are tolerated, before giving up with io.ErrNoProgress
const maxEmptyReads = 100

// header has all the values for the first output line, in the same order
// func showResult emits its columns
var header = []string{
	`name`,
	`bytes`,
	`lines`,
	`lf`,
	`crlf`,
	`spaces`,
	`tabs`,
	`trails`,
	`nulls`,
	`fulls`,
	`highs`,
	`bom`,
}

// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully
type event struct {
	// Index points to the task's entry in the results-slice
	Index int

	// Stats has all the byte-related stats
	Stats stats

	// Err is the completed task's error, or lack of
	Err error
}

// main shows the help message when asked to; otherwise it emits the header
// line, finds all files to check, and shows their stats in the order given,
// quitting with a non-zero exit code when anything failed
func main() {
	args := os.Args[1:]

	if len(args) > 0 {
		switch args[0] {
		case `-h`, `--h`, `-help`, `--help`:
			// skip the leading line-feed in the help message
			os.Stdout.WriteString(info[1:])
			return

		case `--`:
			args = args[1:]
		}
	}

	// show first/heading line right away, to let users know things are
	// happening
	for i, s := range header {
		if i > 0 {
			os.Stdout.WriteString("\t")
		}
		os.Stdout.WriteString(s)
	}
	// assume an error means later stages/apps in a pipe had enough input and
	// quit successfully, so quit successfully too
	if _, err := os.Stdout.WriteString("\n"); err != nil {
		return
	}

	// names has all filepaths given, ignoring repetitions
	names, ok := findAllFiles(deduplicate(args))
	if !ok {
		os.Exit(1)
	}
	if len(names) == 0 {
		// a dash means standard input
		names = []string{`-`}
	}

	events := make(chan event)
	go handleInputs(names, events)
	if !handleOutput(os.Stdout, len(names), events) {
		os.Exit(1)
	}
}

// asyncArgs has everything shared by all the concurrent input-handling tasks
type asyncArgs struct {
	// Results is where each task sends its outcome when it's done
	Results chan event

	// Permissions limits how many worker tasks can be active at the same
	// time: when given many filepaths to work on, rate-limiting avoids
	// a massive number of concurrent tasks which read and process input
	Permissions chan struct{}

	// Tasks is to wait for all tasks to end before quitting the app
	Tasks *sync.WaitGroup
}

// handleInputs launches all the tasks which do the actual work, limiting how
// many inputs are being worked on at the same time; the events channel is
// closed only after all tasks have sent their results
func handleInputs(names []string, events chan event) {
	var inputs sync.WaitGroup
	// the number of tasks is always known in advance
	inputs.Add(len(names))

	args := asyncArgs{
		Results:     events,
		Permissions: make(chan struct{}, runtime.NumCPU()),
		Tasks:       &inputs,
	}

	defer close(args.Results) // allow the output-reporter task to end
	defer close(args.Permissions)

	for i, name := range names {
		// wait until some concurrency-room is available, before proceeding
		args.Permissions <- struct{}{}
		go handleInputAsync(i, name, args)
	}

	// wait for all inputs, before closing the `events` channel, which in turn
	// would quit the whole app right away
	args.Tasks.Wait()
}

// handleInputAsync is the dispatched func used in func handleInputs
func handleInputAsync(i int, name string, args asyncArgs) {
	res, err := handleInput(name)
	// free the concurrency-slot before the possibly-blocking result send,
	// so other inputs can start being worked on
	<-args.Permissions
	args.Results <- event{Index: i, Stats: res, Err: err}
	args.Tasks.Done()
}

// handleInput handles each work-item for func handleInputs: a dash means
// standard input, anything else is a filepath to open; the stats returned
// always have their name and result-status set, even on failure
func handleInput(path string) (stats, error) {
	var res stats
	res.name = path

	if path == `-` {
		err := res.updateStats(os.Stdin)
		return res, err
	}

	f, err := os.Open(path)
	if err != nil {
		res.result = resultError
		// on windows, file-not-found error messages may mention `CreateFile`,
		// even when trying to open files in read-only mode
		return res, errors.New(`can't open file named ` + path)
	}
	defer f.Close()

	err = res.updateStats(f)
	return res, err
}

// handleOutput asynchronously updates output as results are known, whether
// it's errors or successful results; returns whether it succeeded, which
// means no errors happened
func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
	ok = true
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	results := make([]stats, rescount)

	// keep track of which tasks are over, so that on each event all leading
	// results which are ready are shown: all of this ensures prompt output
	// updates as soon as results come in, while keeping the original order
	// of the names/filepaths given
	resultsLeft := results

	for v := range events {
		// resultsLeft shares its items with results, so it sees this update
		results[v.Index] = v.Stats
		if v.Err != nil {
			ok = false
			// show all output so far before the error message
			bw.Flush()
			showError(v.Err)

			// stay in the current loop, in case this failure was keeping
			// previous successes from showing up
		}

		for len(resultsLeft) > 0 {
			if resultsLeft[0].result == resultPending {
				break
			}

			if err := showResult(bw, resultsLeft[0]); err != nil {
				// assume later stages/apps in a pipe had enough input
				return ok
			}
			resultsLeft = resultsLeft[1:]
		}

		// show leading results immediately, if any
		bw.Flush()
	}

	return ok
}

// showError emits a line with the error message given to standard error
func showError(err error) {
	os.Stderr.WriteString(err.Error())
	os.Stderr.WriteString("\n")
}

// showResult emits a TSV line with the stats given, following the column
// order from the header; failed results emit nothing
func showResult(w *bufio.Writer, s stats) error {
	if s.result == resultError {
		return nil
	}

	// this order must match the one in the header
	columns := [...]counter{
		s.bytes, s.lines, s.lf, s.crlf, s.spaces,
		s.tabs, s.trailing, s.nulls, s.fulls, s.highs,
	}

	// errors from a bufio.Writer are sticky, so checking only the last
	// write is enough to detect any previous failure
	var buf [64]byte
	w.WriteString(s.name)
	for _, n := range columns {
		w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(n), 10))
	}
	w.WriteByte('\t')
	w.WriteString(bomLegend[s.bom])
	return w.WriteByte('\n')
}

// deduplicate avoids repeating items, keeping the original slice unchanged,
// as well as the order of the first appearance of each item
func deduplicate(src []string) []string {
	unique := make([]string, 0, len(src))
	got := make(map[string]struct{}, len(src))

	for _, s := range src {
		if _, ok := got[s]; ok {
			continue
		}

		unique = append(unique, s)
		got[s] = struct{}{}
	}

	return unique
}

// findAllFiles can be given a mix of file/folder paths, finding all files
// recursively in folders, avoiding duplicates; a dash is kept as is, since
// it stands for the standard input. All errors are shown as they happen,
// and the second result is false when any error happened.
func findAllFiles(paths []string) (found []string, ok bool) {
	ok = true
	got := make(map[string]struct{})

	// add remembers a filepath, unless it's already been found
	add := func(path string) {
		if _, dup := got[path]; dup {
			return
		}
		got[path] = struct{}{}
		found = append(found, path)
	}

	// fail shows an error, and remembers the search wasn't a full success
	fail := func(err error) {
		showError(err)
		ok = false
	}

	// visit handles each entry while walking a folder; errors stop the
	// walk and are returned by filepath.WalkDir, where they're reported
	visit := func(path string, entry fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if !entry.IsDir() {
			add(path)
		}
		return nil
	}

	for _, s := range paths {
		// a dash means standard input
		if s == `-` {
			add(s)
			continue
		}

		st, err := os.Stat(s)
		if os.IsNotExist(err) {
			// on windows, file-not-found messages may mention `CreateFile`,
			// even when trying to open files in read-only mode
			fail(errors.New(`can't find file/folder named ` + s))
			continue
		}

		if err != nil {
			fail(err)
			continue
		}

		if !st.IsDir() {
			add(s)
			continue
		}

		if err := filepath.WalkDir(s, visit); err != nil {
			fail(err)
		}
	}

	return found, ok
}

// counter makes it easy to change the int-size of almost all counters
type counter uint64

// statResult constrains possible result-states/values in type stats
type statResult int

const (
	// resultPending is the default not-yet-ready result-status
	resultPending = statResult(0)

	// resultError means result should show as an error, instead of data
	resultError = statResult(1)

	// resultSuccess means a result's stats are ready to show
	resultSuccess = statResult(2)
)

// bomType is the type for the byte-order-mark enumeration
type bomType int

// these values are also the indices of their names in bomLegend
const (
	noBOM      = bomType(0)
	utf8BOM    = bomType(1)
	utf16leBOM = bomType(2)
	utf16beBOM = bomType(3)
	utf32leBOM = bomType(4)
	utf32beBOM = bomType(5)
)

// bomLegend has the string-equivalents of the bomType constants
var bomLegend = []string{
	``,
	`UTF-8`,
	`UTF-16 LE`,
	`UTF-16 BE`,
	`UTF-32 LE`,
	`UTF-32 BE`,
}

// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read
	bytes counter

	// lines counts lines, and is 0 only when the byte-count is also 0
	lines counter

	// maxWidth is maximum byte-width of lines, excluding carriage-returns
	// and/or line-feeds
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// the type of byte-order mark detected
	bom bomType

	// name is the filepath of the file/source these stats are about
	name string

	// result keeps track of whether results are valid and/or ready
	result statResult
}

// updateStats does what it says, reading everything from a reader, and
// setting the result-status according to whether reading succeeded
func (res *stats) updateStats(r io.Reader) error {
	err := res.updateUsing(r)
	if err == io.EOF {
		err = nil
	}

	res.result = resultSuccess
	if err != nil {
		res.result = resultError
	}
	return err
}

// checkBOM finds which byte-order mark the data given start with, if any;
// the UTF-32 checks come before the UTF-16 ones, because the UTF-32 LE mark
// starts with the UTF-16 LE one
func checkBOM(data []byte) bomType {
	d := data
	l := len(d)

	if l >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf {
		return utf8BOM
	}
	if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
		return utf32leBOM
	}
	if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
		return utf32beBOM
	}
	if l >= 2 && d[0] == 0xff && d[1] == 0xfe {
		return utf16leBOM
	}
	if l >= 2 && d[0] == 0xfe && d[1] == 0xff {
		return utf16beBOM
	}

	return noBOM
}

// updateUsing helps func updateStats do its job: it reads until the reader
// stops giving data, returning nil when input ended normally, or the error
// which stopped it; all stats reflect the data read, even when failing
func (res *stats) updateUsing(r io.Reader) error {
	var buf [32 * 1024]byte
	var tallies [256]uint64

	// head remembers the first few bytes across reads, since readers can
	// give fewer bytes than a byte-order mark needs on their first read
	var head [4]byte
	headLen := 0

	// emptyReads counts consecutive reads giving neither data nor errors
	emptyReads := 0

	var width counter
	var prev1, prev2 byte

	for {
		n, err := r.Read(buf[:])
		if n < 1 {
			if err == nil {
				// no data and no error means nothing happened, not that
				// the input is over: try again, but not forever
				emptyReads++
				if emptyReads < maxEmptyReads {
					continue
				}
				err = io.ErrNoProgress
			}

			res.applyTallies(&tallies)
			res.bom = checkBOM(head[:headLen])

			if err == io.EOF {
				return res.handleEnd(width, prev1, prev2)
			}
			return err
		}

		emptyReads = 0
		chunk := buf[:n]
		if headLen < len(head) {
			headLen += copy(head[headLen:], chunk)
		}
		res.bytes += counter(n)

		for _, b := range chunk {
			// count values without branching, because it's fun
			tallies[b]++

			if b != '\n' {
				prev2 = prev1
				prev1 = b
				width++
				continue
			}

			// handle line-feeds

			crlf := count(prev1, '\r')
			res.crlf += crlf

			// count lines with trailing spaces, whether these end with
			// a CRLF byte-pair or just a line-feed byte
			if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
				res.trailing++
			}

			// exclude any CR from the current line's width-count
			width -= crlf
			if res.maxWidth < width {
				res.maxWidth = width
			}

			prev2 = prev1
			prev1 = b
			width = 0
		}
	}
}

// applyTallies updates all stats which come directly from byte-value counts;
// it's only meant to be used once by func updateUsing, when input is over
func (res *stats) applyTallies(tallies *[256]uint64) {
	res.lines = counter(tallies['\n'])
	res.tabs = counter(tallies['\t'])
	res.spaces = counter(tallies[' '])
	res.lf = counter(tallies['\n'])
	res.nulls = counter(tallies[0])
	res.fulls = counter(tallies[255])
	for _, n := range tallies[128:] {
		res.highs += counter(n)
	}
}

// handleEnd fixes/finalizes stats when input data end; this func is only
// meant to be used by func updateUsing, since it takes some of the latter's
// local variables
func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
	// the last line may have trailing spaces, even without a line-feed
	if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
		res.trailing++
	}

	if res.maxWidth < width {
		res.maxWidth = width
	}

	// avoid reporting 0 lines with a non-0 byte-count: this is unlike the
	// standard cmd-line tool `wc`
	if res.bytes > 0 && prev1 != '\n' {
		res.lines++
	}

	return nil
}

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly/branchlessly to totals
func count(x, y byte) counter {
	if x == y {
		return 1
	}
	return 0
}