File: coby.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright (c) 2026 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the "Software"), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath coby.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "errors"
  37     "io"
  38     "io/fs"
  39     "os"
  40     "path/filepath"
  41     "runtime"
  42     "strconv"
  43     "sync"
  44 )
  45 
  46 const info = `
  47 coby [options...] [files/folders...]
  48 
  49 
  50 COunt BYtes finds out some simple byte-related stats, counting
  51 
  52     - bytes
  53     - lines
  54     - how many lines have trailing spaces (trails)
  55     - how many lines end with a CRLF pair
  56     - all-bits-off (null) bytes
  57     - all-bits-on (full) bytes
  58     - top-bit-on (high) bytes
  59     - which unicode byte-order-mark (bom) sequence the data start with
  60 
  61 Some of these stats (lines, CRLFs, BOMs) only make sense for plain-text
  62 data, and thus may not be meaningful for general binary data.
  63 
  64 The output is TSV (tab-separated values) lines, where the first line has
  65 all the column names.
  66 
  67 When no filepaths are given, the standard input is used by default. All
  68 folder names given expand recursively into all filenames in them. A mix
  69 of files/folders is supported for convenience.
  70 
  71 The only option available is to show this help message, using any of
  72 "-h", "--h", "-help", or "--help", without the quotes.
  73 `
  74 
  75 // header has all the values for the first output line
  76 var header = []string{
  77     `name`,
  78     `bytes`,
  79     `lines`,
  80     `lf`,
  81     `crlf`,
  82     `spaces`,
  83     `tabs`,
  84     `trails`,
  85     `nulls`,
  86     `fulls`,
  87     `highs`,
  88     `bom`,
  89 }
  90 
  91 // event has what the output-reporting task needs to show the results of a
  92 // task which has just completed, perhaps unsuccessfully
  93 type event struct {
  94     // Index points to the task's entry in the results-slice
  95     Index int
  96 
  97     // Stats has all the byte-related stats
  98     Stats stats
  99 
 100     // Err is the completed task's error, or lack of
 101     Err error
 102 }
 103 
 104 func main() {
 105     args := os.Args[1:]
 106 
 107     if len(args) > 0 {
 108         switch args[0] {
 109         case `-h`, `--h`, `-help`, `--help`:
 110             os.Stdout.WriteString(info[1:])
 111             return
 112 
 113         case `--`:
 114             args = args[1:]
 115         }
 116     }
 117 
 118     // show first/heading line right away, to let users know things are
 119     // happening
 120     for i, s := range header {
 121         if i > 0 {
 122             os.Stdout.WriteString("\t")
 123         }
 124         os.Stdout.WriteString(s)
 125     }
 126     // assume an error means later stages/apps in a pipe had enough input and
 127     // quit successfully, so quit successfully too
 128     _, err := os.Stdout.WriteString("\n")
 129     if err != nil {
 130         return
 131     }
 132 
 133     // names has all filepaths given, ignoring repetitions
 134     names, ok := findAllFiles(args)
 135     if !ok {
 136         os.Exit(1)
 137         return
 138     }
 139     if len(names) == 0 {
 140         names = []string{`-`}
 141     }
 142 
 143     events := make(chan event)
 144     go handleInputs(names, events)
 145     if !handleOutput(os.Stdout, len(names), events) {
 146         os.Exit(1)
 147         return
 148     }
 149 }
 150 
 151 // handleInputs launches all the tasks which do the actual work, limiting how
 152 // many inputs are being worked on at the same time
 153 func handleInputs(names []string, events chan<- event) {
 154     defer close(events) // allow the output-reporter task to end
 155 
 156     var tasks sync.WaitGroup
 157     // the number of tasks is always known in advance
 158     tasks.Add(len(names))
 159 
 160     // permissions is buffered to limit concurrency to the core-count
 161     permissions := make(chan struct{}, runtime.NumCPU())
 162     defer close(permissions)
 163 
 164     for i, name := range names {
 165         // wait until some concurrency-room is available, before proceeding
 166         permissions <- struct{}{}
 167 
 168         go func(i int, name string) {
 169             defer tasks.Done()
 170 
 171             res, err := handleInput(name)
 172             <-permissions
 173             events <- event{Index: i, Stats: res, Err: err}
 174         }(i, name)
 175     }
 176 
 177     // wait for all inputs, before closing the `events` channel, which in turn
 178     // would quit the whole app right away
 179     tasks.Wait()
 180 }
 181 
 182 // handleInput handles each work-item for func handleInputs
 183 func handleInput(path string) (stats, error) {
 184     var res stats
 185     res.name = path
 186 
 187     if path == `-` {
 188         err := res.updateStats(os.Stdin)
 189         return res, err
 190     }
 191 
 192     f, err := os.Open(path)
 193     if err != nil {
 194         res.result = resultError
 195         // on windows, file-not-found error messages may mention `CreateFile`,
 196         // even when trying to open files in read-only mode
 197         return res, errors.New(`can't open file named ` + path)
 198     }
 199     defer f.Close()
 200 
 201     err = res.updateStats(f)
 202     return res, err
 203 }
 204 
 205 // handleOutput asynchronously updates output as results are known, whether
 206 // it's errors or successful results; returns whether it succeeded, which
 207 // means no errors happened
 208 func handleOutput(w io.Writer, inputs int, events <-chan event) (ok bool) {
 209     bw := bufio.NewWriter(w)
 210     defer bw.Flush()
 211 
 212     ok = true
 213     results := make([]stats, inputs)
 214 
 215     // keep track of which tasks are over, so that on each event all leading
 216     // results which are ready are shown: all of this ensures prompt output
 217     // updates as soon as results come in, while keeping the original order
 218     // of the names/filepaths given
 219     resultsLeft := results
 220 
 221     for v := range events {
 222         results[v.Index] = v.Stats
 223         if v.Err != nil {
 224             ok = false
 225             bw.Flush()
 226             showError(v.Err)
 227 
 228             // stay in the current loop, in case this failure was keeping
 229             // previous successes from showing up
 230         }
 231 
 232         for len(resultsLeft) > 0 {
 233             if resultsLeft[0].result == resultPending {
 234                 break
 235             }
 236 
 237             if err := showResult(bw, resultsLeft[0]); err != nil {
 238                 // assume later stages/apps in a pipe had enough input
 239                 return ok
 240             }
 241             resultsLeft = resultsLeft[1:]
 242         }
 243 
 244         // show leading results immediately, if any
 245         bw.Flush()
 246     }
 247 
 248     return ok
 249 }
 250 
 251 func showError(err error) {
 252     os.Stderr.WriteString(err.Error())
 253     os.Stderr.WriteString("\n")
 254 }
 255 
 256 // showResult shows a TSV line for results marked as successful, doing nothing
 257 // when given other types of results
 258 func showResult(w *bufio.Writer, s stats) error {
 259     if s.result != resultSuccess {
 260         return nil
 261     }
 262 
 263     var buf [24]byte
 264     w.WriteString(s.name)
 265     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.bytes), 10))
 266     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lines), 10))
 267     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lf), 10))
 268     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.crlf), 10))
 269     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.spaces), 10))
 270     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.tabs), 10))
 271     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.trailing), 10))
 272     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.nulls), 10))
 273     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.fulls), 10))
 274     w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.highs), 10))
 275     w.WriteByte('\t')
 276     w.WriteString(bomLegend[s.bom])
 277     return w.WriteByte('\n')
 278 }
 279 
 280 // findAllFiles can be given a mix of file/folder paths, finding all files
 281 // recursively in folders, avoiding duplicates
 282 func findAllFiles(paths []string) (files []string, success bool) {
 283     walk := filepath.WalkDir
 284     got := make(map[string]struct{})
 285     success = true
 286 
 287     for _, path := range paths {
 288         if _, ok := got[path]; ok {
 289             continue
 290         }
 291         got[path] = struct{}{}
 292 
 293         // a dash means standard input
 294         if path == `-` {
 295             files = append(files, path)
 296             continue
 297         }
 298 
 299         info, err := os.Stat(path)
 300         if os.IsNotExist(err) {
 301             // on windows, file-not-found messages may mention `CreateFile`,
 302             // even when trying to open files in read-only mode
 303             err = errors.New(`can't find file/folder named ` + path)
 304         }
 305 
 306         if err != nil {
 307             showError(err)
 308             success = false
 309             continue
 310         }
 311 
 312         if !info.IsDir() {
 313             files = append(files, path)
 314             continue
 315         }
 316 
 317         err = walk(path, func(path string, info fs.DirEntry, err error) error {
 318             path, err = filepath.Abs(path)
 319             if err != nil {
 320                 showError(err)
 321                 success = false
 322                 return err
 323             }
 324 
 325             if _, ok := got[path]; ok {
 326                 if info.IsDir() {
 327                     return fs.SkipDir
 328                 }
 329                 return nil
 330             }
 331             got[path] = struct{}{}
 332 
 333             if err != nil {
 334                 showError(err)
 335                 success = false
 336                 return err
 337             }
 338 
 339             if info.IsDir() {
 340                 return nil
 341             }
 342 
 343             files = append(files, path)
 344             return nil
 345         })
 346 
 347         if err != nil {
 348             showError(err)
 349             success = false
 350         }
 351     }
 352 
 353     return files, success
 354 }
 355 
 356 // counter makes it easy to change the int-size of almost all counters
 357 type counter uint64
 358 
 359 // statResult constrains possible result-states/values in type stats
 360 type statResult int
 361 
 362 const (
 363     // resultPending is the default not-yet-ready result-status
 364     resultPending = statResult(0)
 365 
 366     // resultError means result should show as an error, instead of data
 367     resultError = statResult(1)
 368 
 369     // resultSuccess means a result's stats are ready to show
 370     resultSuccess = statResult(2)
 371 )
 372 
 373 // bomType is the type for the byte-order-mark enumeration
 374 type bomType int
 375 
 376 const (
 377     noBOM      = bomType(0)
 378     utf8BOM    = bomType(1)
 379     utf16leBOM = bomType(2)
 380     utf16beBOM = bomType(3)
 381     utf32leBOM = bomType(4)
 382     utf32beBOM = bomType(5)
 383 )
 384 
 385 // bomLegend has the string-equivalents of the bomType constants
 386 var bomLegend = []string{
 387     ``,
 388     `UTF-8`,
 389     `UTF-16 LE`,
 390     `UTF-16 BE`,
 391     `UTF-32 LE`,
 392     `UTF-32 BE`,
 393 }
 394 
 395 // stats has all the size-stats for some input, as well as a way to
 396 // skip showing results, in case of an error such as `file not found`
 397 type stats struct {
 398     // bytes counts all bytes read
 399     bytes counter
 400 
 401     // lines counts lines, and is 0 only when the byte-count is also 0
 402     lines counter
 403 
 404     // maxWidth is maximum byte-width of lines, excluding carriage-returns
 405     // and/or line-feeds
 406     maxWidth counter
 407 
 408     // nulls counts all-bits-off bytes
 409     nulls counter
 410 
 411     // fulls counts all-bits-on bytes
 412     fulls counter
 413 
 414     // highs counts bytes with their `top` (highest-order) bit on
 415     highs counter
 416 
 417     // spaces counts ASCII spaces
 418     spaces counter
 419 
 420     // tabs counts ASCII tabs
 421     tabs counter
 422 
 423     // trailing counts lines with trailing spaces in them
 424     trailing counter
 425 
 426     // lf counts ASCII line-feeds as their own byte-values: this means its
 427     // value will always be at least the same as field `crlf`
 428     lf counter
 429 
 430     // crlf counts ASCII CRLF byte-pairs
 431     crlf counter
 432 
 433     // the type of byte-order mark detected
 434     bom bomType
 435 
 436     // name is the filepath of the file/source these stats are about
 437     name string
 438 
 439     // results keeps track of whether results are valid and/or ready
 440     result statResult
 441 }
 442 
 443 // updateStats does what it says, reading everything from a reader
 444 func (res *stats) updateStats(r io.Reader) error {
 445     err := res.updateUsing(r)
 446     if err == io.EOF {
 447         err = nil
 448     }
 449 
 450     if err == nil {
 451         res.result = resultSuccess
 452     } else {
 453         res.result = resultError
 454     }
 455     return err
 456 }
 457 
 458 func checkBOM(data []byte) bomType {
 459     d := data
 460     l := len(data)
 461 
 462     if l >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf {
 463         return utf8BOM
 464     }
 465     if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
 466         return utf32leBOM
 467     }
 468     if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
 469         return utf32beBOM
 470     }
 471     if l >= 2 && data[0] == 0xff && data[1] == 0xfe {
 472         return utf16leBOM
 473     }
 474     if l >= 2 && data[0] == 0xfe && data[1] == 0xff {
 475         return utf16beBOM
 476     }
 477 
 478     return noBOM
 479 }
 480 
 481 // updateUsing helps func updateStats do its job
 482 func (res *stats) updateUsing(r io.Reader) error {
 483     var buf [32 * 1024]byte
 484     var tallies [256]uint64
 485 
 486     var width counter
 487     var prev1, prev2 byte
 488 
 489     for {
 490         n, err := r.Read(buf[:])
 491         if n < 1 {
 492             res.lines = counter(tallies['\n'])
 493             res.tabs = counter(tallies['\t'])
 494             res.spaces = counter(tallies[' '])
 495             res.lf = counter(tallies['\n'])
 496             res.nulls = counter(tallies[0])
 497             res.fulls = counter(tallies[255])
 498             for i := 128; i < len(tallies); i++ {
 499                 res.highs += counter(tallies[i])
 500             }
 501 
 502             if err == io.EOF {
 503                 return res.handleEnd(width, prev1, prev2)
 504             }
 505             return err
 506         }
 507 
 508         chunk := buf[:n]
 509         if res.bytes == 0 {
 510             res.bom = checkBOM(chunk)
 511         }
 512         res.bytes += counter(n)
 513 
 514         for _, b := range chunk {
 515             // count values without branching, because it's fun
 516             tallies[b]++
 517 
 518             if b != '\n' {
 519                 prev2 = prev1
 520                 prev1 = b
 521                 width++
 522                 continue
 523             }
 524 
 525             // handle line-feeds
 526 
 527             crlf := count(prev1, '\r')
 528             res.crlf += crlf
 529 
 530             // count lines with trailing spaces, whether these end with
 531             // a CRLF byte-pair or just a line-feed byte
 532             if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 533                 res.trailing++
 534             }
 535 
 536             // exclude any CR from the current line's width-count
 537             width -= crlf
 538             if res.maxWidth < width {
 539                 res.maxWidth = width
 540             }
 541 
 542             prev2 = prev1
 543             prev1 = b
 544             width = 0
 545         }
 546     }
 547 }
 548 
 549 // handleEnd fixes/finalizes stats when input data end; this func is only
 550 // meant to be used by func updateStats, since it takes some of the latter's
 551 // local variables
 552 func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
 553     if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 554         res.trailing++
 555     }
 556 
 557     if res.maxWidth < width {
 558         res.maxWidth = width
 559     }
 560 
 561     // avoid reporting 0 lines with a non-0 byte-count: this is unlike the
 562     // standard cmd-line tool `wc`
 563     if res.bytes > 0 && prev1 != '\n' {
 564         res.lines++
 565     }
 566 
 567     return nil
 568 }
 569 
 570 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 571 // be added directly/branchlessly to totals
 572 func count(x, y byte) counter {
 573     var c counter
 574     if x == y {
 575         c = 1
 576     } else {
 577         c = 0
 578     }
 579     return c
 580 }