File: coby.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath coby.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "errors"
  37     "io"
  38     "io/fs"
  39     "os"
  40     "path/filepath"
  41     "runtime"
  42     "strconv"
  43     "sync"
  44 )
  45 
// info is the help message; func main skips its leading line-feed when
// showing it
const info = `
coby [options...] [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - how many lines have trailing spaces (trails)
    - how many lines end with a CRLF pair
    - all-bits-off (null) bytes
    - all-bits-on (full) bytes
    - top-bit-on (high) bytes
    - which unicode byte-order-mark (bom) sequence the data start with

Some of these stats (lines, CRLFs, BOMs) only make sense for plain-text
data, and thus may not be meaningful for general binary data.

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them. A mix
of files/folders is supported for convenience.

The only option available is to show this help message, using any of
"-h", "--h", "-help", or "--help", without the quotes.
`
  74 
  75 // header has all the values for the first output line
  76 var header = []string{
  77     `name`,
  78     `bytes`,
  79     `lines`,
  80     `lf`,
  81     `crlf`,
  82     `spaces`,
  83     `tabs`,
  84     `trails`,
  85     `nulls`,
  86     `fulls`,
  87     `highs`,
  88     `bom`,
  89 }
  90 
// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully; worker tasks send
// these, and func handleOutput receives them
type event struct {
    // Index points to the task's entry in the results-slice, which keeps
    // the same order as the names/filepaths the tasks were started from
    Index int

    // Stats has all the byte-related stats
    Stats stats

    // Err is the completed task's error, or nil when the task succeeded
    Err error
}
 103 
 104 func main() {
 105     args := os.Args[1:]
 106 
 107     if len(args) > 0 {
 108         switch args[0] {
 109         case `-h`, `--h`, `-help`, `--help`:
 110             os.Stderr.WriteString(info[1:])
 111             return
 112 
 113         case `--`:
 114             args = args[1:]
 115         }
 116     }
 117 
 118     // show first/heading line right away, to let users know things are
 119     // happening
 120     for i, s := range header {
 121         if i > 0 {
 122             os.Stdout.WriteString("\t")
 123         }
 124         os.Stdout.WriteString(s)
 125     }
 126     // assume an error means later stages/apps in a pipe had enough input and
 127     // quit successfully, so quit successfully too
 128     _, err := os.Stdout.WriteString("\n")
 129     if err != nil {
 130         return
 131     }
 132 
 133     // names has all filepaths given, ignoring repetitions
 134     names, ok := findAllFiles(deduplicate(args))
 135     if !ok {
 136         os.Exit(1)
 137     }
 138     if len(names) == 0 {
 139         names = []string{`-`}
 140     }
 141 
 142     events := make(chan event)
 143     go handleInputs(names, events)
 144     if !handleOutput(os.Stdout, len(names), events) {
 145         os.Exit(1)
 146     }
 147 }
 148 
// asyncArgs bundles what all worker tasks share: func handleInputs sets it
// up and gives it to each call of func handleInputAsync
type asyncArgs struct {
    // Results is where each worker task sends its event when it's over
    Results chan event

    // Permissions limits how many worker tasks can be active at the same
    // time: when given many filepaths to work on, rate-limiting avoids
    // a massive number of concurrent tasks which read and process input
    Permissions chan struct{}

    // Tasks is to wait for all tasks to end before quitting the app
    Tasks *sync.WaitGroup
}
 160 
 161 // handleInputs launches all the tasks which do the actual work, limiting how
 162 // many inputs are being worked on at the same time
 163 func handleInputs(names []string, events chan event) {
 164     var inputs sync.WaitGroup
 165     // the number of tasks is always known in advance
 166     inputs.Add(len(names))
 167 
 168     args := asyncArgs{
 169         Results:     events,
 170         Permissions: make(chan struct{}, runtime.NumCPU()),
 171         Tasks:       &inputs,
 172     }
 173 
 174     defer close(args.Results) // allow the output-reporter task to end
 175     defer close(args.Permissions)
 176 
 177     for i, name := range names {
 178         // wait until some concurrency-room is available, before proceeding
 179         args.Permissions <- struct{}{}
 180         go handleInputAsync(i, name, args)
 181     }
 182 
 183     // wait for all inputs, before closing the `events` channel, which in turn
 184     // would quit the whole app right away
 185     args.Tasks.Wait()
 186 }
 187 
 188 // handleInputAsync is the dispatched func used in func handleInputs
 189 func handleInputAsync(i int, name string, args asyncArgs) {
 190     defer args.Tasks.Done()
 191     defer func() { <-args.Permissions }()
 192     res, err := handleInput(name)
 193     args.Results <- event{Index: i, Stats: res, Err: err}
 194 }
 195 
 196 // handleInput handles each work-item for func handleInputs
 197 func handleInput(path string) (stats, error) {
 198     var res stats
 199     res.name = path
 200 
 201     if path == `-` {
 202         err := res.updateStats(os.Stdin)
 203         return res, err
 204     }
 205 
 206     f, err := os.Open(path)
 207     if err != nil {
 208         res.result = resultError
 209         // on windows, file-not-found error messages may mention `CreateFile`,
 210         // even when trying to open files in read-only mode
 211         return res, errors.New(`can't open file named ` + path)
 212     }
 213     defer f.Close()
 214 
 215     err = res.updateStats(f)
 216     return res, err
 217 }
 218 
 219 // handleOutput asynchronously updates output as results are known, whether
 220 // it's errors or successful results; returns whether it succeeded, which
 221 // means no errors happened
 222 func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
 223     ok = true
 224     bw := bufio.NewWriter(w)
 225     defer bw.Flush()
 226 
 227     results := make([]stats, rescount)
 228 
 229     // keep track of which tasks are over, so that on each event all leading
 230     // results which are ready are shown: all of this ensures prompt output
 231     // updates as soon as results come in, while keeping the original order
 232     // of the names/filepaths given
 233     resultsLeft := results
 234 
 235     for v := range events {
 236         results[v.Index] = v.Stats
 237         if v.Err != nil {
 238             ok = false
 239             bw.Flush()
 240             showError(v.Err)
 241 
 242             // stay in the current loop, in case this failure was keeping
 243             // previous successes from showing up
 244         }
 245 
 246         for len(resultsLeft) > 0 {
 247             if resultsLeft[0].result == resultPending {
 248                 break
 249             }
 250 
 251             if err := showResult(bw, resultsLeft[0]); err != nil {
 252                 // assume later stages/apps in a pipe had enough input
 253                 return ok
 254             }
 255             resultsLeft = resultsLeft[1:]
 256         }
 257 
 258         // show leading results immediately, if any
 259         bw.Flush()
 260     }
 261 
 262     return ok
 263 }
 264 
 265 func showError(err error) {
 266     os.Stderr.WriteString(err.Error())
 267     os.Stderr.WriteString("\n")
 268 }
 269 
 270 // showResult does what it says
 271 func showResult(w *bufio.Writer, s stats) error {
 272     if s.result == resultError {
 273         return nil
 274     }
 275 
 276     var buf [64]byte
 277     w.WriteString(s.name)
 278     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.bytes), 10))
 279     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lines), 10))
 280     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lf), 10))
 281     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.crlf), 10))
 282     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.spaces), 10))
 283     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.tabs), 10))
 284     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.trailing), 10))
 285     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.nulls), 10))
 286     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.fulls), 10))
 287     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.highs), 10))
 288     w.WriteByte('\t')
 289     w.WriteString(bomLegend[s.bom])
 290     return w.WriteByte('\n')
 291 }
 292 
 293 // deduplicate avoids repeating items, keeping the original slice unchanged
 294 func deduplicate(src []string) []string {
 295     var unique []string
 296     got := make(map[string]struct{})
 297 
 298     for _, s := range src {
 299         if _, ok := got[s]; ok {
 300             continue
 301         }
 302 
 303         unique = append(unique, s)
 304         got[s] = struct{}{}
 305     }
 306 
 307     return unique
 308 }
 309 
 310 // findAllFiles can be given a mix of file/folder paths, finding all files
 311 // recursively in folders, avoiding duplicates
 312 func findAllFiles(paths []string) (found []string, ok bool) {
 313     res := make(chan any)
 314     var all sync.WaitGroup
 315     all.Add(1)
 316 
 317     go func() {
 318         defer all.Done()
 319         got := make(map[string]struct{})
 320         ok = true
 321 
 322         for v := range res {
 323             if err, ok := v.(error); ok {
 324                 showError(err)
 325                 ok = false
 326                 continue
 327             }
 328 
 329             s, ok := v.(string)
 330             if !ok {
 331                 showError(errors.New(`value is neither string nor error`))
 332                 ok = false
 333                 continue
 334             }
 335 
 336             if _, ok := got[s]; ok {
 337                 continue
 338             }
 339 
 340             got[s] = struct{}{}
 341             found = append(found, s)
 342         }
 343     }()
 344 
 345     rec := func(path string, info fs.DirEntry, err error) error {
 346         if err != nil {
 347             res <- err
 348             return err
 349         }
 350 
 351         if info.IsDir() {
 352             return nil
 353         }
 354 
 355         res <- path
 356         return nil
 357     }
 358 
 359     for _, s := range paths {
 360         // a dash means standard input
 361         if s == `-` {
 362             res <- s
 363             continue
 364         }
 365 
 366         info, err := os.Stat(s)
 367         if os.IsNotExist(err) {
 368             // on windows, file-not-found messages may mention `CreateFile`,
 369             // even when trying to open files in read-only mode
 370             res <- errors.New(`can't find file/folder named ` + s)
 371             continue
 372         }
 373 
 374         if err != nil {
 375             res <- err
 376             continue
 377         }
 378 
 379         if !info.IsDir() {
 380             res <- s
 381             continue
 382         }
 383 
 384         if err := filepath.WalkDir(s, rec); err != nil {
 385             res <- err
 386         }
 387     }
 388 
 389     close(res)
 390     all.Wait()
 391 
 392     return found, ok
 393 }
 394 
// counter makes it easy to change the int-size of almost all counters: it's
// the type of all the numeric fields in type stats
type counter uint64
 397 
 398 // statResult constrains possible result-states/values in type stats
 399 type statResult int
 400 
 401 const (
 402     // resultPending is the default not-yet-ready result-status
 403     resultPending = statResult(0)
 404 
 405     // resultError means result should show as an error, instead of data
 406     resultError = statResult(1)
 407 
 408     // resultSuccess means a result's stats are ready to show
 409     resultSuccess = statResult(2)
 410 )
 411 
 412 // bomType is the type for the byte-order-mark enumeration
 413 type bomType int
 414 
 415 const (
 416     noBOM      = bomType(0)
 417     utf8BOM    = bomType(1)
 418     utf16leBOM = bomType(2)
 419     utf16beBOM = bomType(3)
 420     utf32leBOM = bomType(4)
 421     utf32beBOM = bomType(5)
 422 )
 423 
 424 // bomLegend has the string-equivalents of the bomType constants
 425 var bomLegend = []string{
 426     ``,
 427     `UTF-8`,
 428     `UTF-16 LE`,
 429     `UTF-16 BE`,
 430     `UTF-32 LE`,
 431     `UTF-32 BE`,
 432 }
 433 
// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
    // bytes counts all bytes read
    bytes counter

    // lines counts lines, and is 0 only when the byte-count is also 0: a
    // last line which doesn't end with a line-feed is counted too
    lines counter

    // maxWidth is maximum byte-width of lines, excluding carriage-returns
    // and/or line-feeds; func showResult doesn't output this value
    maxWidth counter

    // nulls counts all-bits-off bytes
    nulls counter

    // fulls counts all-bits-on bytes
    fulls counter

    // highs counts bytes with their `top` (highest-order) bit on
    highs counter

    // spaces counts ASCII spaces
    spaces counter

    // tabs counts ASCII tabs
    tabs counter

    // trailing counts lines with trailing spaces in them
    trailing counter

    // lf counts ASCII line-feeds as their own byte-values: this means its
    // value will always be at least the same as field `crlf`
    lf counter

    // crlf counts ASCII CRLF byte-pairs
    crlf counter

    // bom is the type of byte-order mark detected at the start of the data
    bom bomType

    // name is the filepath of the file/source these stats are about
    name string

    // result keeps track of whether results are valid and/or ready
    result statResult
}
 481 
 482 // updateStats does what it says, reading everything from a reader
 483 func (res *stats) updateStats(r io.Reader) error {
 484     err := res.updateUsing(r)
 485     if err == io.EOF {
 486         err = nil
 487     }
 488 
 489     if err == nil {
 490         res.result = resultSuccess
 491     } else {
 492         res.result = resultError
 493     }
 494     return err
 495 }
 496 
 497 func checkBOM(data []byte) bomType {
 498     d := data
 499     l := len(data)
 500 
 501     if l >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf {
 502         return utf8BOM
 503     }
 504     if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
 505         return utf32leBOM
 506     }
 507     if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
 508         return utf32beBOM
 509     }
 510     if l >= 2 && data[0] == 0xff && data[1] == 0xfe {
 511         return utf16leBOM
 512     }
 513     if l >= 2 && data[0] == 0xfe && data[1] == 0xff {
 514         return utf16beBOM
 515     }
 516 
 517     return noBOM
 518 }
 519 
 520 // updateUsing helps func updateStats do its job
 521 func (res *stats) updateUsing(r io.Reader) error {
 522     var buf [32 * 1024]byte
 523     var tallies [256]uint64
 524 
 525     var width counter
 526     var prev1, prev2 byte
 527 
 528     for {
 529         n, err := r.Read(buf[:])
 530         if n < 1 {
 531             res.lines = counter(tallies['\n'])
 532             res.tabs = counter(tallies['\t'])
 533             res.spaces = counter(tallies[' '])
 534             res.lf = counter(tallies['\n'])
 535             res.nulls = counter(tallies[0])
 536             res.fulls = counter(tallies[255])
 537             for i := 128; i < 256; i++ {
 538                 res.highs += counter(tallies[i])
 539             }
 540 
 541             if err == io.EOF {
 542                 return res.handleEnd(width, prev1, prev2)
 543             }
 544             return err
 545         }
 546 
 547         chunk := buf[:n]
 548         if res.bytes == 0 {
 549             res.bom = checkBOM(chunk)
 550         }
 551         res.bytes += counter(n)
 552 
 553         for _, b := range chunk {
 554             // count values without branching, because it's fun
 555             tallies[b]++
 556 
 557             if b != '\n' {
 558                 prev2 = prev1
 559                 prev1 = b
 560                 width++
 561                 continue
 562             }
 563 
 564             // handle line-feeds
 565 
 566             crlf := count(prev1, '\r')
 567             res.crlf += crlf
 568 
 569             // count lines with trailing spaces, whether these end with
 570             // a CRLF byte-pair or just a line-feed byte
 571             if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 572                 res.trailing++
 573             }
 574 
 575             // exclude any CR from the current line's width-count
 576             width -= crlf
 577             if res.maxWidth < width {
 578                 res.maxWidth = width
 579             }
 580 
 581             prev2 = prev1
 582             prev1 = b
 583             width = 0
 584         }
 585     }
 586 }
 587 
 588 // handleEnd fixes/finalizes stats when input data end; this func is only
 589 // meant to be used by func updateStats, since it takes some of the latter's
 590 // local variables
 591 func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
 592     if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 593         res.trailing++
 594     }
 595 
 596     if res.maxWidth < width {
 597         res.maxWidth = width
 598     }
 599 
 600     // avoid reporting 0 lines with a non-0 byte-count: this is unlike the
 601     // standard cmd-line tool `wc`
 602     if res.bytes > 0 && prev1 != '\n' {
 603         res.lines++
 604     }
 605 
 606     return nil
 607 }
 608 
 609 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 610 // be added directly/branchlessly to totals
 611 func count(x, y byte) counter {
 612     var c counter
 613     if x == y {
 614         c = 1
 615     } else {
 616         c = 0
 617     }
 618     return c
 619 }