File: coby.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright (c) 2026 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the "Software"), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath coby.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "errors"
  37     "io"
  38     "io/fs"
  39     "os"
  40     "path/filepath"
  41     "runtime"
  42     "strconv"
  43     "sync"
  44 )
  45 
  46 const info = `
  47 coby [options...] [files/folders...]
  48 
  49 
  50 COunt BYtes finds out some simple byte-related stats, counting
  51 
  52     - bytes
  53     - lines
  54     - how many lines have trailing spaces (trails)
  55     - how many lines end with a CRLF pair
  56     - all-bits-off (null) bytes
  57     - all-bits-on (full) bytes
  58     - top-bit-on (high) bytes
  59     - which unicode byte-order-mark (bom) sequence the data start with
  60 
  61 Some of these stats (lines, CRLFs, BOMs) only make sense for plain-text
  62 data, and thus may not be meaningful for general binary data.
  63 
  64 The output is TSV (tab-separated values) lines, where the first line has
  65 all the column names.
  66 
  67 When no filepaths are given, the standard input is used by default. All
  68 folder names given expand recursively into all filenames in them. A mix
  69 of files/folders is supported for convenience.
  70 
  71 The only option available is to show this help message, using any of
  72 "-h", "--h", "-help", or "--help", without the quotes.
  73 `
  74 
  75 // header has all the values for the first output line
  76 var header = []string{
  77     `name`,
  78     `bytes`,
  79     `lines`,
  80     `lf`,
  81     `crlf`,
  82     `spaces`,
  83     `tabs`,
  84     `trails`,
  85     `nulls`,
  86     `fulls`,
  87     `highs`,
  88     `bom`,
  89 }
  90 
// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully; values of this type
// are sent by func handleInputAsync and received by func handleOutput
type event struct {
    // Index points to the task's entry in the results-slice, which follows
    // the order of the names/filepaths given
    Index int

    // Stats has all the byte-related stats
    Stats stats

    // Err is the completed task's error, or nil when the task succeeded
    Err error
}
 103 
 104 func main() {
 105     args := os.Args[1:]
 106 
 107     if len(args) > 0 {
 108         switch args[0] {
 109         case `-h`, `--h`, `-help`, `--help`:
 110             os.Stdout.WriteString(info[1:])
 111             return
 112 
 113         case `--`:
 114             args = args[1:]
 115         }
 116     }
 117 
 118     // show first/heading line right away, to let users know things are
 119     // happening
 120     for i, s := range header {
 121         if i > 0 {
 122             os.Stdout.WriteString("\t")
 123         }
 124         os.Stdout.WriteString(s)
 125     }
 126     // assume an error means later stages/apps in a pipe had enough input and
 127     // quit successfully, so quit successfully too
 128     _, err := os.Stdout.WriteString("\n")
 129     if err != nil {
 130         return
 131     }
 132 
 133     // names has all filepaths given, ignoring repetitions
 134     names, ok := findAllFiles(deduplicate(args))
 135     if !ok {
 136         os.Exit(1)
 137     }
 138     if len(names) == 0 {
 139         names = []string{`-`}
 140     }
 141 
 142     events := make(chan event)
 143     // runtime.GOMAXPROCS(runtime.NumCPU())
 144     go handleInputs(names, events)
 145     if !handleOutput(os.Stdout, len(names), events) {
 146         os.Exit(1)
 147     }
 148 }
 149 
// asyncArgs bundles what func handleInputs shares with all the worker tasks
// it starts
type asyncArgs struct {
    // Results is where each worker task sends its event when it's done
    Results chan event

    // Permissions limits how many worker tasks can be active at the same
    // time: when given many filepaths to work on, rate-limiting avoids
    // a massive number of concurrent tasks which read and process input
    Permissions chan struct{}

    // Tasks is to wait for all tasks to end before quitting the app
    Tasks *sync.WaitGroup
}
 161 
 162 // handleInputs launches all the tasks which do the actual work, limiting how
 163 // many inputs are being worked on at the same time
 164 func handleInputs(names []string, events chan event) {
 165     var inputs sync.WaitGroup
 166     // the number of tasks is always known in advance
 167     inputs.Add(len(names))
 168 
 169     args := asyncArgs{
 170         Results:     events,
 171         Permissions: make(chan struct{}, runtime.NumCPU()),
 172         Tasks:       &inputs,
 173     }
 174 
 175     defer close(args.Results) // allow the output-reporter task to end
 176     defer close(args.Permissions)
 177 
 178     for i, name := range names {
 179         // wait until some concurrency-room is available, before proceeding
 180         args.Permissions <- struct{}{}
 181         go handleInputAsync(i, name, args)
 182     }
 183 
 184     // wait for all inputs, before closing the `events` channel, which in turn
 185     // would quit the whole app right away
 186     args.Tasks.Wait()
 187 }
 188 
 189 // handleInputAsync is the dispatched func used in func handleInputs
 190 func handleInputAsync(i int, name string, args asyncArgs) {
 191     res, err := handleInput(name)
 192     <-args.Permissions
 193     args.Results <- event{Index: i, Stats: res, Err: err}
 194     args.Tasks.Done()
 195 }
 196 
 197 // handleInput handles each work-item for func handleInputs
 198 func handleInput(path string) (stats, error) {
 199     var res stats
 200     res.name = path
 201 
 202     if path == `-` {
 203         err := res.updateStats(os.Stdin)
 204         return res, err
 205     }
 206 
 207     f, err := os.Open(path)
 208     if err != nil {
 209         res.result = resultError
 210         // on windows, file-not-found error messages may mention `CreateFile`,
 211         // even when trying to open files in read-only mode
 212         return res, errors.New(`can't open file named ` + path)
 213     }
 214     defer f.Close()
 215 
 216     err = res.updateStats(f)
 217     return res, err
 218 }
 219 
 220 // handleOutput asynchronously updates output as results are known, whether
 221 // it's errors or successful results; returns whether it succeeded, which
 222 // means no errors happened
 223 func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
 224     ok = true
 225     bw := bufio.NewWriter(w)
 226     defer bw.Flush()
 227 
 228     results := make([]stats, rescount)
 229 
 230     // keep track of which tasks are over, so that on each event all leading
 231     // results which are ready are shown: all of this ensures prompt output
 232     // updates as soon as results come in, while keeping the original order
 233     // of the names/filepaths given
 234     resultsLeft := results
 235 
 236     for v := range events {
 237         results[v.Index] = v.Stats
 238         if v.Err != nil {
 239             ok = false
 240             bw.Flush()
 241             showError(v.Err)
 242 
 243             // stay in the current loop, in case this failure was keeping
 244             // previous successes from showing up
 245         }
 246 
 247         for len(resultsLeft) > 0 {
 248             if resultsLeft[0].result == resultPending {
 249                 break
 250             }
 251 
 252             if err := showResult(bw, resultsLeft[0]); err != nil {
 253                 // assume later stages/apps in a pipe had enough input
 254                 return ok
 255             }
 256             resultsLeft = resultsLeft[1:]
 257         }
 258 
 259         // show leading results immediately, if any
 260         bw.Flush()
 261     }
 262 
 263     return ok
 264 }
 265 
 266 func showError(err error) {
 267     os.Stderr.WriteString(err.Error())
 268     os.Stderr.WriteString("\n")
 269 }
 270 
 271 // showResult does what it says
 272 func showResult(w *bufio.Writer, s stats) error {
 273     if s.result == resultError {
 274         return nil
 275     }
 276 
 277     var buf [64]byte
 278     w.WriteString(s.name)
 279     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.bytes), 10))
 280     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lines), 10))
 281     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lf), 10))
 282     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.crlf), 10))
 283     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.spaces), 10))
 284     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.tabs), 10))
 285     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.trailing), 10))
 286     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.nulls), 10))
 287     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.fulls), 10))
 288     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.highs), 10))
 289     w.WriteByte('\t')
 290     w.WriteString(bomLegend[s.bom])
 291     return w.WriteByte('\n')
 292 }
 293 
 294 // deduplicate avoids repeating items, keeping the original slice unchanged
 295 func deduplicate(src []string) []string {
 296     var unique []string
 297     got := make(map[string]struct{})
 298 
 299     for _, s := range src {
 300         if _, ok := got[s]; ok {
 301             continue
 302         }
 303 
 304         unique = append(unique, s)
 305         got[s] = struct{}{}
 306     }
 307 
 308     return unique
 309 }
 310 
 311 // findAllFiles can be given a mix of file/folder paths, finding all files
 312 // recursively in folders, avoiding duplicates
 313 func findAllFiles(paths []string) (found []string, ok bool) {
 314     res := make(chan any)
 315     var all sync.WaitGroup
 316     all.Add(1)
 317 
 318     go func() {
 319         defer all.Done()
 320         got := make(map[string]struct{})
 321         ok = true
 322 
 323         for v := range res {
 324             if err, ok := v.(error); ok {
 325                 showError(err)
 326                 ok = false
 327                 continue
 328             }
 329 
 330             s, ok := v.(string)
 331             if !ok {
 332                 showError(errors.New(`value is neither string nor error`))
 333                 ok = false
 334                 continue
 335             }
 336 
 337             if _, ok := got[s]; ok {
 338                 continue
 339             }
 340 
 341             got[s] = struct{}{}
 342             found = append(found, s)
 343         }
 344     }()
 345 
 346     rec := func(path string, info fs.DirEntry, err error) error {
 347         if err != nil {
 348             res <- err
 349             return err
 350         }
 351 
 352         if info.IsDir() {
 353             return nil
 354         }
 355 
 356         res <- path
 357         return nil
 358     }
 359 
 360     for _, s := range paths {
 361         // a dash means standard input
 362         if s == `-` {
 363             res <- s
 364             continue
 365         }
 366 
 367         info, err := os.Stat(s)
 368         if os.IsNotExist(err) {
 369             // on windows, file-not-found messages may mention `CreateFile`,
 370             // even when trying to open files in read-only mode
 371             res <- errors.New(`can't find file/folder named ` + s)
 372             continue
 373         }
 374 
 375         if err != nil {
 376             res <- err
 377             continue
 378         }
 379 
 380         if !info.IsDir() {
 381             res <- s
 382             continue
 383         }
 384 
 385         if err := filepath.WalkDir(s, rec); err != nil {
 386             res <- err
 387         }
 388     }
 389 
 390     close(res)
 391     all.Wait()
 392 
 393     return found, ok
 394 }
 395 
// counter makes it easy to change the int-size of almost all counters: it's
// the type of all the count-like fields in type stats
type counter uint64
 398 
 399 // statResult constrains possible result-states/values in type stats
 400 type statResult int
 401 
 402 const (
 403     // resultPending is the default not-yet-ready result-status
 404     resultPending = statResult(0)
 405 
 406     // resultError means result should show as an error, instead of data
 407     resultError = statResult(1)
 408 
 409     // resultSuccess means a result's stats are ready to show
 410     resultSuccess = statResult(2)
 411 )
 412 
 413 // bomType is the type for the byte-order-mark enumeration
 414 type bomType int
 415 
 416 const (
 417     noBOM      = bomType(0)
 418     utf8BOM    = bomType(1)
 419     utf16leBOM = bomType(2)
 420     utf16beBOM = bomType(3)
 421     utf32leBOM = bomType(4)
 422     utf32beBOM = bomType(5)
 423 )
 424 
 425 // bomLegend has the string-equivalents of the bomType constants
 426 var bomLegend = []string{
 427     ``,
 428     `UTF-8`,
 429     `UTF-16 LE`,
 430     `UTF-16 BE`,
 431     `UTF-32 LE`,
 432     `UTF-32 BE`,
 433 }
 434 
// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
    // bytes counts all bytes read
    bytes counter

    // lines counts lines, and is 0 only when the byte-count is also 0
    lines counter

    // maxWidth is maximum byte-width of lines, excluding carriage-returns
    // and/or line-feeds; note func showResult doesn't emit this value
    maxWidth counter

    // nulls counts all-bits-off bytes
    nulls counter

    // fulls counts all-bits-on bytes
    fulls counter

    // highs counts bytes with their `top` (highest-order) bit on
    highs counter

    // spaces counts ASCII spaces
    spaces counter

    // tabs counts ASCII tabs
    tabs counter

    // trailing counts lines with trailing spaces in them
    trailing counter

    // lf counts ASCII line-feeds as their own byte-values: this means its
    // value will always be at least the same as field `crlf`
    lf counter

    // crlf counts ASCII CRLF byte-pairs
    crlf counter

    // bom is the type of byte-order mark detected
    bom bomType

    // name is the filepath of the file/source these stats are about
    name string

    // result keeps track of whether results are valid and/or ready
    result statResult
}
 482 
 483 // updateStats does what it says, reading everything from a reader
 484 func (res *stats) updateStats(r io.Reader) error {
 485     err := res.updateUsing(r)
 486     if err == io.EOF {
 487         err = nil
 488     }
 489 
 490     if err == nil {
 491         res.result = resultSuccess
 492     } else {
 493         res.result = resultError
 494     }
 495     return err
 496 }
 497 
 498 func checkBOM(data []byte) bomType {
 499     d := data
 500     l := len(data)
 501 
 502     if l >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf {
 503         return utf8BOM
 504     }
 505     if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
 506         return utf32leBOM
 507     }
 508     if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
 509         return utf32beBOM
 510     }
 511     if l >= 2 && data[0] == 0xff && data[1] == 0xfe {
 512         return utf16leBOM
 513     }
 514     if l >= 2 && data[0] == 0xfe && data[1] == 0xff {
 515         return utf16beBOM
 516     }
 517 
 518     return noBOM
 519 }
 520 
 521 // updateUsing helps func updateStats do its job
 522 func (res *stats) updateUsing(r io.Reader) error {
 523     var buf [32 * 1024]byte
 524     var tallies [256]uint64
 525 
 526     var width counter
 527     var prev1, prev2 byte
 528 
 529     for {
 530         n, err := r.Read(buf[:])
 531         if n < 1 {
 532             res.lines = counter(tallies['\n'])
 533             res.tabs = counter(tallies['\t'])
 534             res.spaces = counter(tallies[' '])
 535             res.lf = counter(tallies['\n'])
 536             res.nulls = counter(tallies[0])
 537             res.fulls = counter(tallies[255])
 538             for i := 128; i < len(tallies); i++ {
 539                 res.highs += counter(tallies[i])
 540             }
 541 
 542             if err == io.EOF {
 543                 return res.handleEnd(width, prev1, prev2)
 544             }
 545             return err
 546         }
 547 
 548         chunk := buf[:n]
 549         if res.bytes == 0 {
 550             res.bom = checkBOM(chunk)
 551         }
 552         res.bytes += counter(n)
 553 
 554         for _, b := range chunk {
 555             // count values without branching, because it's fun
 556             tallies[b]++
 557 
 558             if b != '\n' {
 559                 prev2 = prev1
 560                 prev1 = b
 561                 width++
 562                 continue
 563             }
 564 
 565             // handle line-feeds
 566 
 567             crlf := count(prev1, '\r')
 568             res.crlf += crlf
 569 
 570             // count lines with trailing spaces, whether these end with
 571             // a CRLF byte-pair or just a line-feed byte
 572             if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 573                 res.trailing++
 574             }
 575 
 576             // exclude any CR from the current line's width-count
 577             width -= crlf
 578             if res.maxWidth < width {
 579                 res.maxWidth = width
 580             }
 581 
 582             prev2 = prev1
 583             prev1 = b
 584             width = 0
 585         }
 586     }
 587 }
 588 
 589 // handleEnd fixes/finalizes stats when input data end; this func is only
 590 // meant to be used by func updateStats, since it takes some of the latter's
 591 // local variables
 592 func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
 593     if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 594         res.trailing++
 595     }
 596 
 597     if res.maxWidth < width {
 598         res.maxWidth = width
 599     }
 600 
 601     // avoid reporting 0 lines with a non-0 byte-count: this is unlike the
 602     // standard cmd-line tool `wc`
 603     if res.bytes > 0 && prev1 != '\n' {
 604         res.lines++
 605     }
 606 
 607     return nil
 608 }
 609 
 610 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 611 // be added directly/branchlessly to totals
 612 func count(x, y byte) counter {
 613     var c counter
 614     if x == y {
 615         c = 1
 616     } else {
 617         c = 0
 618     }
 619     return c
 620 }