File: coby.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath coby.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "errors"
  37     "io"
  38     "io/fs"
  39     "os"
  40     "path/filepath"
  41     "runtime"
  42     "strconv"
  43     "sync"
  44 )
  45 
// info is the help message shown when any of the help options is given; it
// starts with a line-feed, which func main skips by showing info[1:]
const info = `
coby [options...] [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - how many lines have trailing spaces (trails)
    - how many lines end with a CRLF pair
    - all-bits-off (null) bytes
    - all-bits-on (full) bytes
    - top-bit-on (high) bytes
    - which unicode byte-order-mark (bom) sequence the data start with

Some of these stats (lines, CRLFs, BOMs) only make sense for plain-text
data, and thus may not be meaningful for general binary data.

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them. A mix
of files/folders is supported for convenience.

The only option available is to show this help message, using any of
"-h", "--h", "-help", or "--help", without the quotes.
`
  74 
  75 // header has all the values for the first output line
  76 var header = []string{
  77     `name`,
  78     `bytes`,
  79     `lines`,
  80     `lf`,
  81     `crlf`,
  82     `spaces`,
  83     `tabs`,
  84     `trails`,
  85     `nulls`,
  86     `fulls`,
  87     `highs`,
  88     `bom`,
  89 }
  90 
// event is what a worker task sends to the output-reporting task when it
// completes, whether it succeeded or failed
type event struct {
	// Index is the position of the task's entry in the results-slice, which
	// is the same as the position of its name among the names/filepaths
	Index int

	// Stats has all the byte-related stats for the task's input
	Stats stats

	// Err is the completed task's error, or nil when the task succeeded
	Err error
}
 103 
 104 func main() {
 105     args := os.Args[1:]
 106 
 107     if len(args) > 0 {
 108         switch args[0] {
 109         case `-h`, `--h`, `-help`, `--help`:
 110             os.Stderr.WriteString(info[1:])
 111             return
 112 
 113         case `--`:
 114             args = args[1:]
 115         }
 116     }
 117 
 118     // show first/heading line right away, to let users know things are
 119     // happening
 120     for i, s := range header {
 121         if i > 0 {
 122             os.Stdout.WriteString("\t")
 123         }
 124         os.Stdout.WriteString(s)
 125     }
 126     // assume an error means later stages/apps in a pipe had enough input and
 127     // quit successfully, so quit successfully too
 128     _, err := os.Stdout.WriteString("\n")
 129     if err != nil {
 130         return
 131     }
 132 
 133     // names has all filepaths given, ignoring repetitions
 134     names, ok := findAllFiles(deduplicate(args))
 135     if !ok {
 136         os.Exit(1)
 137     }
 138     if len(names) == 0 {
 139         names = []string{`-`}
 140     }
 141 
 142     events := make(chan event)
 143     go handleInputs(names, events)
 144     if !handleOutput(os.Stdout, len(names), events) {
 145         os.Exit(1)
 146     }
 147 }
 148 
 149 // handleInputs launches all the tasks which do the actual work, limiting how
 150 // many inputs are being worked on at the same time
 151 func handleInputs(names []string, events chan event) {
 152     // allow output-reporter task to end, and thus the app
 153     defer close(events)
 154 
 155     // permissions limits how many worker tasks can be active at the same
 156     // time: when given many filepaths to work on, rate-limiting avoids
 157     // a massive number of concurrent tasks which read and process input
 158     permissions := make(chan struct{}, runtime.NumCPU())
 159     defer close(permissions)
 160 
 161     var inputs sync.WaitGroup
 162     for i := range names {
 163         // wait until some concurrency-room is available
 164         permissions <- struct{}{}
 165         inputs.Add(1)
 166 
 167         go func(i int) {
 168             defer inputs.Done()
 169             defer func() { <-permissions }()
 170             res, err := handleInput(names[i])
 171             events <- event{i, res, err}
 172         }(i)
 173     }
 174 
 175     // wait for all inputs, before closing the `events` channel
 176     inputs.Wait()
 177 }
 178 
 179 // handleInput handles each work-item for func handleInputs
 180 func handleInput(path string) (stats, error) {
 181     var res stats
 182     res.name = path
 183 
 184     if path == `-` {
 185         err := res.updateStats(os.Stdin)
 186         return res, err
 187     }
 188 
 189     f, err := os.Open(path)
 190     if err != nil {
 191         res.result = resultError
 192         // on windows, file-not-found error messages may mention `CreateFile`,
 193         // even when trying to open files in read-only mode
 194         return res, errors.New(`can't open file named ` + path)
 195     }
 196     defer f.Close()
 197 
 198     err = res.updateStats(f)
 199     return res, err
 200 }
 201 
 202 // handleOutput asynchronously updates output as results are known, whether
 203 // it's errors or successful results; returns whether it succeeded, which
 204 // means no errors happened
 205 func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
 206     ok = true
 207     bw := bufio.NewWriter(w)
 208     defer bw.Flush()
 209 
 210     results := make([]stats, rescount)
 211 
 212     // keep track of which tasks are over, so that on each event all leading
 213     // results which are ready are shown: all of this ensures prompt output
 214     // updates as soon as results come in, while keeping the original order
 215     // of the names/filepaths given
 216     resultsLeft := results
 217 
 218     for v := range events {
 219         results[v.Index] = v.Stats
 220         if v.Err != nil {
 221             ok = false
 222             bw.Flush()
 223             showError(v.Err)
 224 
 225             // stay in the current loop, in case this failure was keeping
 226             // previous successes from showing up
 227         }
 228 
 229         n := countLeadingReady(resultsLeft)
 230 
 231         for _, res := range resultsLeft[:n] {
 232             if err := showResult(bw, res); err != nil {
 233                 // assume later stages/apps in a pipe had enough input and
 234                 // quit successfully, so quit successfully too
 235                 return true
 236             }
 237         }
 238         resultsLeft = resultsLeft[n:]
 239 
 240         // flush output-buffer only if anything new was shown
 241         if n > 0 {
 242             bw.Flush()
 243         }
 244     }
 245 
 246     return ok
 247 }
 248 
 249 func showError(err error) {
 250     os.Stderr.WriteString(err.Error())
 251     os.Stderr.WriteString("\n")
 252 }
 253 
 254 // showResult does what it says
 255 func showResult(w *bufio.Writer, s stats) error {
 256     if s.result == resultError {
 257         return nil
 258     }
 259 
 260     var buf [64]byte
 261     w.WriteString(s.name)
 262     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.bytes), 10))
 263     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lines), 10))
 264     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lf), 10))
 265     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.crlf), 10))
 266     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.spaces), 10))
 267     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.tabs), 10))
 268     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.trailing), 10))
 269     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.nulls), 10))
 270     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.fulls), 10))
 271     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.highs), 10))
 272     w.WriteByte('\t')
 273     w.WriteString(bomLegend[s.bom])
 274     return w.WriteByte('\n')
 275 }
 276 
 277 // deduplicate avoids repeating items, keeping the original slice unchanged
 278 func deduplicate(src []string) []string {
 279     var unique []string
 280     got := make(map[string]struct{})
 281 
 282     for _, s := range src {
 283         if _, ok := got[s]; ok {
 284             continue
 285         }
 286 
 287         unique = append(unique, s)
 288         got[s] = struct{}{}
 289     }
 290 
 291     return unique
 292 }
 293 
 294 // findAllFiles can be given a mix of file/folder paths, finding all files
 295 // recursively in folders, avoiding duplicates
 296 func findAllFiles(paths []string) (found []string, ok bool) {
 297     res := make(chan any)
 298     var all sync.WaitGroup
 299     all.Add(1)
 300 
 301     go func() {
 302         defer all.Done()
 303         got := make(map[string]struct{})
 304         ok = true
 305 
 306         for v := range res {
 307             if err, ok := v.(error); ok {
 308                 showError(err)
 309                 ok = false
 310                 continue
 311             }
 312 
 313             s, ok := v.(string)
 314             if !ok {
 315                 showError(errors.New(`value is neither string nor error`))
 316                 ok = false
 317                 continue
 318             }
 319 
 320             if _, ok := got[s]; ok {
 321                 continue
 322             }
 323 
 324             got[s] = struct{}{}
 325             found = append(found, s)
 326         }
 327     }()
 328 
 329     rec := func(path string, info fs.DirEntry, err error) error {
 330         if err != nil {
 331             res <- err
 332             return err
 333         }
 334 
 335         if info.IsDir() {
 336             return nil
 337         }
 338 
 339         res <- path
 340         return nil
 341     }
 342 
 343     for _, s := range paths {
 344         // a dash means standard input
 345         if s == `-` {
 346             res <- s
 347             continue
 348         }
 349 
 350         info, err := os.Stat(s)
 351         if os.IsNotExist(err) {
 352             // on windows, file-not-found messages may mention `CreateFile`,
 353             // even when trying to open files in read-only mode
 354             res <- errors.New(`can't find file/folder named ` + s)
 355             continue
 356         }
 357 
 358         if err != nil {
 359             res <- err
 360             continue
 361         }
 362 
 363         if !info.IsDir() {
 364             res <- s
 365             continue
 366         }
 367 
 368         if err := filepath.WalkDir(s, rec); err != nil {
 369             res <- err
 370         }
 371     }
 372 
 373     close(res)
 374     all.Wait()
 375 
 376     return found, ok
 377 }
 378 
// counter is the unsigned-integer type of the tallies in type stats: having
// it makes it easy to change the int-size of almost all counters at once
type counter uint64
 381 
 382 // statResult constrains possible result-states/values in type stats
 383 type statResult int
 384 
 385 const (
 386     // resultPending is the default not-yet-ready result-status
 387     resultPending = statResult(0)
 388 
 389     // resultError means result should show as an error, instead of data
 390     resultError = statResult(1)
 391 
 392     // resultSuccess means a result's stats are ready to show
 393     resultSuccess = statResult(2)
 394 )
 395 
 396 // bomType is the type for the byte-order-mark enumeration
 397 type bomType int
 398 
 399 const (
 400     noBOM      = bomType(0)
 401     utf8BOM    = bomType(1)
 402     utf16leBOM = bomType(2)
 403     utf16beBOM = bomType(3)
 404     utf32leBOM = bomType(4)
 405     utf32beBOM = bomType(5)
 406 )
 407 
 408 // bomLegend has the string-equivalents of the bomType constants
 409 var bomLegend = []string{
 410     ``,
 411     `UTF-8`,
 412     `UTF-16 LE`,
 413     `UTF-16 BE`,
 414     `UTF-32 LE`,
 415     `UTF-32 BE`,
 416 }
 417 
// stats has all the byte-related stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read
	bytes counter

	// lines counts lines, and is 0 only when the byte-count is also 0: a
	// last line which doesn't end with a line-feed is still counted
	lines counter

	// maxWidth is maximum byte-width of lines, excluding carriage-returns
	// and/or line-feeds; func showResult doesn't show this value
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them, whether these
	// lines end with a line-feed, a CRLF byte-pair, or the end of the data
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// bom is the type of byte-order mark detected at the start of the data
	bom bomType

	// name is the filepath of the file/source these stats are about
	name string

	// result keeps track of whether results are valid and/or ready
	result statResult
}
 465 
 466 // updateStats does what it says, reading everything from a reader
 467 func (res *stats) updateStats(r io.Reader) error {
 468     err := res.updateUsing(r)
 469     if err == io.EOF {
 470         err = nil
 471     }
 472 
 473     if err == nil {
 474         res.result = resultSuccess
 475     } else {
 476         res.result = resultError
 477     }
 478     return err
 479 }
 480 
 481 func checkBOM(data []byte) bomType {
 482     d := data
 483     l := len(data)
 484 
 485     if l >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf {
 486         return utf8BOM
 487     }
 488     if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
 489         return utf32leBOM
 490     }
 491     if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
 492         return utf32beBOM
 493     }
 494     if l >= 2 && data[0] == 0xff && data[1] == 0xfe {
 495         return utf16leBOM
 496     }
 497     if l >= 2 && data[0] == 0xfe && data[1] == 0xff {
 498         return utf16beBOM
 499     }
 500 
 501     return noBOM
 502 }
 503 
 504 // updateUsing helps func updateStats do its job
 505 func (res *stats) updateUsing(r io.Reader) error {
 506     var buf [32 * 1024]byte
 507     var tallies [256]uint64
 508 
 509     var width counter
 510     var prev1, prev2 byte
 511 
 512     for {
 513         n, err := r.Read(buf[:])
 514         if n < 1 {
 515             res.lines = counter(tallies['\n'])
 516             res.tabs = counter(tallies['\t'])
 517             res.spaces = counter(tallies[' '])
 518             res.lf = counter(tallies['\n'])
 519             res.nulls = counter(tallies[0])
 520             res.fulls = counter(tallies[255])
 521             for i := 128; i < 256; i++ {
 522                 res.highs += counter(tallies[i])
 523             }
 524 
 525             if err == io.EOF {
 526                 return res.handleEnd(width, prev1, prev2)
 527             }
 528             return err
 529         }
 530 
 531         chunk := buf[:n]
 532         if res.bytes == 0 {
 533             res.bom = checkBOM(chunk)
 534         }
 535         res.bytes += counter(n)
 536 
 537         for _, b := range chunk {
 538             // count values without branching, because it's fun
 539             tallies[b]++
 540 
 541             if b != '\n' {
 542                 prev2 = prev1
 543                 prev1 = b
 544                 width++
 545                 continue
 546             }
 547 
 548             // handle line-feeds
 549 
 550             crlf := count(prev1, '\r')
 551             res.crlf += crlf
 552 
 553             // count lines with trailing spaces, whether these end with
 554             // a CRLF byte-pair or just a line-feed byte
 555             if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 556                 res.trailing++
 557             }
 558 
 559             // exclude any CR from the current line's width-count
 560             width -= crlf
 561             if res.maxWidth < width {
 562                 res.maxWidth = width
 563             }
 564 
 565             prev2 = prev1
 566             prev1 = b
 567             width = 0
 568         }
 569     }
 570 }
 571 
 572 // handleEnd fixes/finalizes stats when input data end; this func is only
 573 // meant to be used by func updateStats, since it takes some of the latter's
 574 // local variables
 575 func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
 576     if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 577         res.trailing++
 578     }
 579 
 580     if res.maxWidth < width {
 581         res.maxWidth = width
 582     }
 583 
 584     // avoid reporting 0 lines with a non-0 byte-count: this is unlike the
 585     // standard cmd-line tool `wc`
 586     if res.bytes > 0 && prev1 != '\n' {
 587         res.lines++
 588     }
 589 
 590     return nil
 591 }
 592 
 593 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 594 // be added directly/branchlessly to totals
 595 func count(x, y byte) counter {
 596     var c counter
 597     if x == y {
 598         c = 1
 599     } else {
 600         c = 0
 601     }
 602     return c
 603 }
 604 
 605 // countLeadingReady finds how many items are ready to show at the start of a
 606 // results-slice, which ensures output matches the original item-order
 607 func countLeadingReady(values []stats) int {
 608     for i, v := range values {
 609         if v.result == resultPending {
 610             return i
 611         }
 612     }
 613     return len(values)
 614 }