File: coby.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright (c) 2026 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the "Software"), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath coby.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "errors"
  37     "io"
  38     "io/fs"
  39     "os"
  40     "path/filepath"
  41     "runtime"
  42     "strconv"
  43     "sync"
  44 )
  45 
  46 const info = `
  47 coby [options...] [files/folders...]
  48 
  49 
  50 COunt BYtes finds out some simple byte-related stats, counting
  51 
  52     - bytes
  53     - lines
  54     - how many lines have trailing spaces (trails)
  55     - how many lines end with a CRLF pair
  56     - all-bits-off (null) bytes
  57     - all-bits-on (full) bytes
  58     - top-bit-on (high) bytes
  59     - which unicode byte-order-mark (bom) sequence the data start with
  60 
  61 Some of these stats (lines, CRLFs, BOMs) only make sense for plain-text
  62 data, and thus may not be meaningful for general binary data.
  63 
  64 The output is TSV (tab-separated values) lines, where the first line has
  65 all the column names.
  66 
  67 When no filepaths are given, the standard input is used by default. All
  68 folder names given expand recursively into all filenames in them. A mix
  69 of files/folders is supported for convenience.
  70 
  71 The only option available is to show this help message, using any of
  72 "-h", "--h", "-help", or "--help", without the quotes.
  73 `
  74 
  75 // header has all the values for the first output line
  76 var header = []string{
  77     `name`,
  78     `bytes`,
  79     `lines`,
  80     `lf`,
  81     `crlf`,
  82     `spaces`,
  83     `tabs`,
  84     `trails`,
  85     `nulls`,
  86     `fulls`,
  87     `highs`,
  88     `bom`,
  89 }
  90 
  91 // event has what the output-reporting task needs to show the results of a
  92 // task which has just completed, perhaps unsuccessfully
  93 type event struct {
  94     // Index points to the task's entry in the results-slice
  95     Index int
  96 
  97     // Stats has all the byte-related stats
  98     Stats stats
  99 
 100     // Err is the completed task's error, or lack of
 101     Err error
 102 }
 103 
 104 func main() {
 105     args := os.Args[1:]
 106 
 107     if len(args) > 0 {
 108         switch args[0] {
 109         case `-h`, `--h`, `-help`, `--help`:
 110             os.Stdout.WriteString(info[1:])
 111             return
 112 
 113         case `--`:
 114             args = args[1:]
 115         }
 116     }
 117 
 118     // show first/heading line right away, to let users know things are
 119     // happening
 120     for i, s := range header {
 121         if i > 0 {
 122             os.Stdout.WriteString("\t")
 123         }
 124         os.Stdout.WriteString(s)
 125     }
 126     // assume an error means later stages/apps in a pipe had enough input and
 127     // quit successfully, so quit successfully too
 128     _, err := os.Stdout.WriteString("\n")
 129     if err != nil {
 130         return
 131     }
 132 
 133     // names has all filepaths given, ignoring repetitions
 134     names, ok := findAllFiles(args)
 135     if !ok {
 136         os.Exit(1)
 137     }
 138     if len(names) == 0 {
 139         names = []string{`-`}
 140     }
 141 
 142     events := make(chan event)
 143     go handleInputs(names, events)
 144     if !handleOutput(os.Stdout, len(names), events) {
 145         os.Exit(1)
 146     }
 147 }
 148 
 149 // handleInputs launches all the tasks which do the actual work, limiting how
 150 // many inputs are being worked on at the same time
 151 func handleInputs(names []string, events chan<- event) {
 152     defer close(events) // allow the output-reporter task to end
 153 
 154     var tasks sync.WaitGroup
 155     // the number of tasks is always known in advance
 156     tasks.Add(len(names))
 157 
 158     // permissions is buffered to limit concurrency to the core-count
 159     permissions := make(chan struct{}, runtime.NumCPU())
 160     defer close(permissions)
 161 
 162     for i, name := range names {
 163         // wait until some concurrency-room is available, before proceeding
 164         permissions <- struct{}{}
 165 
 166         go func(i int, name string) {
 167             defer tasks.Done()
 168 
 169             res, err := handleInput(name)
 170             <-permissions
 171             events <- event{Index: i, Stats: res, Err: err}
 172         }(i, name)
 173     }
 174 
 175     // wait for all inputs, before closing the `events` channel, which in turn
 176     // would quit the whole app right away
 177     tasks.Wait()
 178 }
 179 
 180 // handleInput handles each work-item for func handleInputs
 181 func handleInput(path string) (stats, error) {
 182     var res stats
 183     res.name = path
 184 
 185     if path == `-` {
 186         err := res.updateStats(os.Stdin)
 187         return res, err
 188     }
 189 
 190     f, err := os.Open(path)
 191     if err != nil {
 192         res.result = resultError
 193         // on windows, file-not-found error messages may mention `CreateFile`,
 194         // even when trying to open files in read-only mode
 195         return res, errors.New(`can't open file named ` + path)
 196     }
 197     defer f.Close()
 198 
 199     err = res.updateStats(f)
 200     return res, err
 201 }
 202 
 203 // handleOutput asynchronously updates output as results are known, whether
 204 // it's errors or successful results; returns whether it succeeded, which
 205 // means no errors happened
 206 func handleOutput(w io.Writer, inputs int, events <-chan event) (ok bool) {
 207     bw := bufio.NewWriter(w)
 208     defer bw.Flush()
 209 
 210     ok = true
 211     results := make([]stats, inputs)
 212 
 213     // keep track of which tasks are over, so that on each event all leading
 214     // results which are ready are shown: all of this ensures prompt output
 215     // updates as soon as results come in, while keeping the original order
 216     // of the names/filepaths given
 217     resultsLeft := results
 218 
 219     for v := range events {
 220         results[v.Index] = v.Stats
 221         if v.Err != nil {
 222             ok = false
 223             bw.Flush()
 224             showError(v.Err)
 225 
 226             // stay in the current loop, in case this failure was keeping
 227             // previous successes from showing up
 228         }
 229 
 230         for len(resultsLeft) > 0 {
 231             if resultsLeft[0].result == resultPending {
 232                 break
 233             }
 234 
 235             if err := showResult(bw, resultsLeft[0]); err != nil {
 236                 // assume later stages/apps in a pipe had enough input
 237                 return ok
 238             }
 239             resultsLeft = resultsLeft[1:]
 240         }
 241 
 242         // show leading results immediately, if any
 243         bw.Flush()
 244     }
 245 
 246     return ok
 247 }
 248 
 249 func showError(err error) {
 250     os.Stderr.WriteString(err.Error())
 251     os.Stderr.WriteString("\n")
 252 }
 253 
 254 // showResult shows a TSV line for results marked as successful, doing nothing
 255 // when given other types of results
 256 func showResult(w *bufio.Writer, s stats) error {
 257     if s.result != resultSuccess {
 258         return nil
 259     }
 260 
 261     var buf [24]byte
 262     w.WriteString(s.name)
 263     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.bytes), 10))
 264     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lines), 10))
 265     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lf), 10))
 266     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.crlf), 10))
 267     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.spaces), 10))
 268     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.tabs), 10))
 269     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.trailing), 10))
 270     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.nulls), 10))
 271     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.fulls), 10))
 272     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.highs), 10))
 273     w.WriteByte('\t')
 274     w.WriteString(bomLegend[s.bom])
 275     return w.WriteByte('\n')
 276 }
 277 
 278 // findAllFiles can be given a mix of file/folder paths, finding all files
 279 // recursively in folders, avoiding duplicates
 280 func findAllFiles(paths []string) (files []string, success bool) {
 281     walk := filepath.WalkDir
 282     got := make(map[string]struct{})
 283     success = true
 284 
 285     for _, path := range paths {
 286         if _, ok := got[path]; ok {
 287             continue
 288         }
 289         got[path] = struct{}{}
 290 
 291         // a dash means standard input
 292         if path == `-` {
 293             files = append(files, path)
 294             continue
 295         }
 296 
 297         info, err := os.Stat(path)
 298         if os.IsNotExist(err) {
 299             // on windows, file-not-found messages may mention `CreateFile`,
 300             // even when trying to open files in read-only mode
 301             err = errors.New(`can't find file/folder named ` + path)
 302         }
 303 
 304         if err != nil {
 305             showError(err)
 306             success = false
 307             continue
 308         }
 309 
 310         if !info.IsDir() {
 311             files = append(files, path)
 312             continue
 313         }
 314 
 315         err = walk(path, func(path string, info fs.DirEntry, err error) error {
 316             path, err = filepath.Abs(path)
 317             if err != nil {
 318                 showError(err)
 319                 success = false
 320                 return err
 321             }
 322 
 323             if _, ok := got[path]; ok {
 324                 if info.IsDir() {
 325                     return fs.SkipDir
 326                 }
 327                 return nil
 328             }
 329             got[path] = struct{}{}
 330 
 331             if err != nil {
 332                 showError(err)
 333                 success = false
 334                 return err
 335             }
 336 
 337             if info.IsDir() {
 338                 return nil
 339             }
 340 
 341             files = append(files, path)
 342             return nil
 343         })
 344 
 345         if err != nil {
 346             showError(err)
 347             success = false
 348         }
 349     }
 350 
 351     return files, success
 352 }
 353 
 354 // counter makes it easy to change the int-size of almost all counters
 355 type counter uint64
 356 
 357 // statResult constrains possible result-states/values in type stats
 358 type statResult int
 359 
 360 const (
 361     // resultPending is the default not-yet-ready result-status
 362     resultPending = statResult(0)
 363 
 364     // resultError means result should show as an error, instead of data
 365     resultError = statResult(1)
 366 
 367     // resultSuccess means a result's stats are ready to show
 368     resultSuccess = statResult(2)
 369 )
 370 
 371 // bomType is the type for the byte-order-mark enumeration
 372 type bomType int
 373 
 374 const (
 375     noBOM      = bomType(0)
 376     utf8BOM    = bomType(1)
 377     utf16leBOM = bomType(2)
 378     utf16beBOM = bomType(3)
 379     utf32leBOM = bomType(4)
 380     utf32beBOM = bomType(5)
 381 )
 382 
 383 // bomLegend has the string-equivalents of the bomType constants
 384 var bomLegend = []string{
 385     ``,
 386     `UTF-8`,
 387     `UTF-16 LE`,
 388     `UTF-16 BE`,
 389     `UTF-32 LE`,
 390     `UTF-32 BE`,
 391 }
 392 
 393 // stats has all the size-stats for some input, as well as a way to
 394 // skip showing results, in case of an error such as `file not found`
 395 type stats struct {
 396     // bytes counts all bytes read
 397     bytes counter
 398 
 399     // lines counts lines, and is 0 only when the byte-count is also 0
 400     lines counter
 401 
 402     // maxWidth is maximum byte-width of lines, excluding carriage-returns
 403     // and/or line-feeds
 404     maxWidth counter
 405 
 406     // nulls counts all-bits-off bytes
 407     nulls counter
 408 
 409     // fulls counts all-bits-on bytes
 410     fulls counter
 411 
 412     // highs counts bytes with their `top` (highest-order) bit on
 413     highs counter
 414 
 415     // spaces counts ASCII spaces
 416     spaces counter
 417 
 418     // tabs counts ASCII tabs
 419     tabs counter
 420 
 421     // trailing counts lines with trailing spaces in them
 422     trailing counter
 423 
 424     // lf counts ASCII line-feeds as their own byte-values: this means its
 425     // value will always be at least the same as field `crlf`
 426     lf counter
 427 
 428     // crlf counts ASCII CRLF byte-pairs
 429     crlf counter
 430 
 431     // the type of byte-order mark detected
 432     bom bomType
 433 
 434     // name is the filepath of the file/source these stats are about
 435     name string
 436 
 437     // results keeps track of whether results are valid and/or ready
 438     result statResult
 439 }
 440 
 441 // updateStats does what it says, reading everything from a reader
 442 func (res *stats) updateStats(r io.Reader) error {
 443     err := res.updateUsing(r)
 444     if err == io.EOF {
 445         err = nil
 446     }
 447 
 448     if err == nil {
 449         res.result = resultSuccess
 450     } else {
 451         res.result = resultError
 452     }
 453     return err
 454 }
 455 
 456 func checkBOM(data []byte) bomType {
 457     d := data
 458     l := len(data)
 459 
 460     if l >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf {
 461         return utf8BOM
 462     }
 463     if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
 464         return utf32leBOM
 465     }
 466     if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
 467         return utf32beBOM
 468     }
 469     if l >= 2 && data[0] == 0xff && data[1] == 0xfe {
 470         return utf16leBOM
 471     }
 472     if l >= 2 && data[0] == 0xfe && data[1] == 0xff {
 473         return utf16beBOM
 474     }
 475 
 476     return noBOM
 477 }
 478 
 479 // updateUsing helps func updateStats do its job
 480 func (res *stats) updateUsing(r io.Reader) error {
 481     var buf [32 * 1024]byte
 482     var tallies [256]uint64
 483 
 484     var width counter
 485     var prev1, prev2 byte
 486 
 487     for {
 488         n, err := r.Read(buf[:])
 489         if n < 1 {
 490             res.lines = counter(tallies['\n'])
 491             res.tabs = counter(tallies['\t'])
 492             res.spaces = counter(tallies[' '])
 493             res.lf = counter(tallies['\n'])
 494             res.nulls = counter(tallies[0])
 495             res.fulls = counter(tallies[255])
 496             for i := 128; i < len(tallies); i++ {
 497                 res.highs += counter(tallies[i])
 498             }
 499 
 500             if err == io.EOF {
 501                 return res.handleEnd(width, prev1, prev2)
 502             }
 503             return err
 504         }
 505 
 506         chunk := buf[:n]
 507         if res.bytes == 0 {
 508             res.bom = checkBOM(chunk)
 509         }
 510         res.bytes += counter(n)
 511 
 512         for _, b := range chunk {
 513             // count values without branching, because it's fun
 514             tallies[b]++
 515 
 516             if b != '\n' {
 517                 prev2 = prev1
 518                 prev1 = b
 519                 width++
 520                 continue
 521             }
 522 
 523             // handle line-feeds
 524 
 525             crlf := count(prev1, '\r')
 526             res.crlf += crlf
 527 
 528             // count lines with trailing spaces, whether these end with
 529             // a CRLF byte-pair or just a line-feed byte
 530             if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 531                 res.trailing++
 532             }
 533 
 534             // exclude any CR from the current line's width-count
 535             width -= crlf
 536             if res.maxWidth < width {
 537                 res.maxWidth = width
 538             }
 539 
 540             prev2 = prev1
 541             prev1 = b
 542             width = 0
 543         }
 544     }
 545 }
 546 
 547 // handleEnd fixes/finalizes stats when input data end; this func is only
 548 // meant to be used by func updateStats, since it takes some of the latter's
 549 // local variables
 550 func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
 551     if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 552         res.trailing++
 553     }
 554 
 555     if res.maxWidth < width {
 556         res.maxWidth = width
 557     }
 558 
 559     // avoid reporting 0 lines with a non-0 byte-count: this is unlike the
 560     // standard cmd-line tool `wc`
 561     if res.bytes > 0 && prev1 != '\n' {
 562         res.lines++
 563     }
 564 
 565     return nil
 566 }
 567 
 568 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 569 // be added directly/branchlessly to totals
 570 func count(x, y byte) counter {
 571     var c counter
 572     if x == y {
 573         c = 1
 574     } else {
 575         c = 0
 576     }
 577     return c
 578 }