File: coby.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 Single-file source-code for coby: this version has no http(s) support. Even
  27 the unit-tests from the original coby are omitted.
  28 
  29 To compile a smaller-sized command-line app, you can use the `go` command as
  30 follows:
  31 
  32 go build -ldflags "-s -w" -trimpath coby.go
  33 */
  34 
  35 package main
  36 
  37 import (
  38     "bufio"
  39     "errors"
  40     "io"
  41     "io/fs"
  42     "os"
  43     "path/filepath"
  44     "runtime"
  45     "strconv"
  46     "sync"
  47 )
  48 
// info is the help message, shown when the first command-line argument is
// one of the help options; its leading line-feed is skipped when shown
const info = `
coby [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - how many lines have trailing spaces
    - how many lines end with a CRLF pair
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data start with

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them.
`
  70 
// header is the first output line: its column names come in the same order
// func showResult uses to emit the values for each result, so any change to
// one must be matched in the other
var header = []string{
    `name`,
    `bytes`,
    `runes`,
    `lines`,
    `lf`,
    `crlf`,
    `spaces`,
    `tabs`,
    `trails`,
    `nulls`,
    `fulls`,
    `highs`,
    `bom`,
}
  87 
// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully: events are sent by
// the tasks func handleInputs starts, and are received by func handleOutput
type event struct {
    // Index points to the task's entry in the results-slice, which is the
    // same as the position of the task's name among all names to handle
    Index int

    // Stats has all the byte-related stats
    Stats stats

    // Err is the completed task's error, or lack of: when not nil, func
    // handleOutput shows it as an error message
    Err error
}
 100 
 101 func main() {
 102     if len(os.Args) > 1 {
 103         switch os.Args[1] {
 104         case `-h`, `--h`, `-help`, `--help`:
 105             os.Stderr.WriteString(info[1:])
 106             return
 107         }
 108     }
 109 
 110     // show first/heading line right away, to let users know things are
 111     // happening
 112     for i, s := range header {
 113         if i > 0 {
 114             os.Stdout.WriteString("\t")
 115         }
 116         os.Stdout.WriteString(s)
 117     }
 118     // assume an error means later stages/apps in a pipe had enough input and
 119     // quit successfully, so quit successfully too
 120     _, err := os.Stdout.WriteString("\n")
 121     if err != nil {
 122         return
 123     }
 124 
 125     // names has all filepaths given, ignoring repetitions
 126     names, ok := findAllFiles(unique(os.Args[1:]))
 127     if !ok {
 128         os.Exit(1)
 129     }
 130     if len(names) == 0 {
 131         names = []string{`-`}
 132     }
 133 
 134     events := make(chan event)
 135     go handleInputs(names, events)
 136     if !handleOutput(os.Stdout, len(names), events) {
 137         os.Exit(1)
 138     }
 139 }
 140 
 141 // handleInputs launches all the tasks which do the actual work, limiting how
 142 // many inputs are being worked on at the same time
 143 func handleInputs(names []string, events chan event) {
 144     // allow output-reporter task to end, and thus the app
 145     defer close(events)
 146 
 147     // permissions limits how many worker tasks can be active at the same
 148     // time: when given many filepaths to work on, rate-limiting avoids
 149     // a massive number of concurrent tasks which read and process input
 150     permissions := make(chan struct{}, runtime.NumCPU())
 151     defer close(permissions)
 152 
 153     var inputs sync.WaitGroup
 154     for i := range names {
 155         // wait until some concurrency-room is available
 156         permissions <- struct{}{}
 157         inputs.Add(1)
 158 
 159         go func(i int) {
 160             defer inputs.Done()
 161             res, err := handleInput(names[i])
 162             events <- event{i, res, err}
 163             <-permissions
 164         }(i)
 165     }
 166 
 167     // wait for all inputs, before closing the `events` channel
 168     inputs.Wait()
 169 }
 170 
 171 // handleInput handles each work-item for func handleInputs
 172 func handleInput(path string) (stats, error) {
 173     var res stats
 174     res.name = path
 175 
 176     if path == `-` {
 177         err := res.updateStats(os.Stdin)
 178         return res, err
 179     }
 180 
 181     f, err := os.Open(path)
 182     if err != nil {
 183         res.result = resultError
 184         // on windows, file-not-found error messages may mention `CreateFile`,
 185         // even when trying to open files in read-only mode
 186         return res, errors.New(`can't open file named ` + path)
 187     }
 188     defer f.Close()
 189 
 190     err = res.updateStats(f)
 191     return res, err
 192 }
 193 
 194 // handleOutput asynchronously updates output as results are known, whether
 195 // it's errors or successful results; returns whether it succeeded, which
 196 // means no errors happened
 197 func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
 198     ok = true
 199     bw := bufio.NewWriter(w)
 200     defer bw.Flush()
 201 
 202     results := make([]stats, rescount)
 203 
 204     // keep track of which tasks are over, so that on each event all leading
 205     // results which are ready are shown: all of this ensures prompt output
 206     // updates as soon as results come in, while keeping the original order
 207     // of the names/filepaths given
 208     resultsLeft := results
 209 
 210     for v := range events {
 211         results[v.Index] = v.Stats
 212         if v.Err != nil {
 213             ok = false
 214             bw.Flush()
 215             showError(v.Err)
 216 
 217             // stay in the current loop, in case this failure was keeping
 218             // previous successes from showing up
 219         }
 220 
 221         n := countLeadingReady(resultsLeft)
 222 
 223         for _, res := range resultsLeft[:n] {
 224             if err := showResult(bw, res); err != nil {
 225                 // assume later stages/apps in a pipe had enough input and
 226                 // quit successfully, so quit successfully too
 227                 return true
 228             }
 229         }
 230         resultsLeft = resultsLeft[n:]
 231 
 232         // flush output-buffer only if anything new was shown
 233         if n > 0 {
 234             bw.Flush()
 235         }
 236     }
 237 
 238     return ok
 239 }
 240 
 241 // showError standardizes how errors from this app look
 242 func showError(err error) {
 243     os.Stderr.WriteString("\x1b[31m")
 244     os.Stderr.WriteString(err.Error())
 245     os.Stderr.WriteString("\x1b[0m\n")
 246 }
 247 
 248 // showResult does what it says
 249 func showResult(w *bufio.Writer, res stats) error {
 250     if res.result == resultError {
 251         return nil
 252     }
 253 
 254     var buf [64]byte
 255     w.WriteString(res.name)
 256     w.WriteByte('\t')
 257     w.Write(strconv.AppendUint(buf[:0], uint64(res.bytes), 10))
 258     w.WriteByte('\t')
 259     w.Write(strconv.AppendUint(buf[:0], uint64(res.runes), 10))
 260     w.WriteByte('\t')
 261     w.Write(strconv.AppendUint(buf[:0], uint64(res.lines), 10))
 262     w.WriteByte('\t')
 263     w.Write(strconv.AppendUint(buf[:0], uint64(res.lf), 10))
 264     w.WriteByte('\t')
 265     w.Write(strconv.AppendUint(buf[:0], uint64(res.crlf), 10))
 266     w.WriteByte('\t')
 267     w.Write(strconv.AppendUint(buf[:0], uint64(res.spaces), 10))
 268     w.WriteByte('\t')
 269     w.Write(strconv.AppendUint(buf[:0], uint64(res.tabs), 10))
 270     w.WriteByte('\t')
 271     w.Write(strconv.AppendUint(buf[:0], uint64(res.trailing), 10))
 272     w.WriteByte('\t')
 273     w.Write(strconv.AppendUint(buf[:0], uint64(res.nulls), 10))
 274     w.WriteByte('\t')
 275     w.Write(strconv.AppendUint(buf[:0], uint64(res.fulls), 10))
 276     w.WriteByte('\t')
 277     w.Write(strconv.AppendUint(buf[:0], uint64(res.highs), 10))
 278     w.WriteByte('\t')
 279     w.WriteString(bomLegend[res.bom])
 280     return w.WriteByte('\n')
 281 }
 282 
 283 // unique ensures items only appear once in the result, keeping the original
 284 // slice unchanged
 285 func unique(src []string) []string {
 286     var unique []string
 287     got := make(map[string]struct{})
 288     for _, s := range src {
 289         if _, ok := got[s]; ok {
 290             continue
 291         }
 292         unique = append(unique, s)
 293         got[s] = struct{}{}
 294     }
 295     return unique
 296 }
 297 
 298 // findAllFiles does what it says, given a mix of file/folder paths, finding
 299 // all files recursively in the case of folders
 300 func findAllFiles(paths []string) (found []string, ok bool) {
 301     var unique []string
 302     got := make(map[string]struct{})
 303     ok = true
 304 
 305     for _, root := range paths {
 306         // a dash means standard input
 307         if root == `-` {
 308             if _, ok := got[root]; ok {
 309                 continue
 310             }
 311 
 312             unique = append(unique, root)
 313             got[root] = struct{}{}
 314             continue
 315         }
 316 
 317         _, err := os.Stat(root)
 318         if os.IsNotExist(err) {
 319             ok = false
 320             // on windows, file-not-found error messages may mention `CreateFile`,
 321             // even when trying to open files in read-only mode
 322             err := errors.New(`can't find file/folder named ` + root)
 323             showError(err)
 324             continue
 325         }
 326 
 327         err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
 328             if err != nil {
 329                 return err
 330             }
 331 
 332             if d.IsDir() {
 333                 return nil
 334             }
 335 
 336             if _, ok := got[path]; ok {
 337                 return nil
 338             }
 339 
 340             unique = append(unique, path)
 341             got[path] = struct{}{}
 342             return nil
 343         })
 344 
 345         if err != nil {
 346             ok = false
 347             showError(err)
 348         }
 349     }
 350 
 351     return unique, ok
 352 }
 353 
// counter makes it easy to change the int-size of almost all counters: the
// exception is the byte-count in type stats, which is a plain int
type counter int

// statResult constrains possible result-states/values in type stats
type statResult int

const (
    // resultPending is the default not-yet-ready result-status: being the
    // zero value, it's the state all stats values start with
    resultPending = statResult(0)

    // resultError means result should show as an error, instead of data:
    // func showResult emits no output line for results in this state
    resultError = statResult(1)

    // resultSuccess means result can be shown
    resultSuccess = statResult(2)
)
 370 
// bomType identifies which byte-order mark (if any) func checkBOM detected
// at the start of some data
type bomType int

// these values are also used as indices for the `bomLegend` slice
const (
    noBOM      = bomType(0) // no byte-order mark detected
    utf8BOM    = bomType(1) // bytes EF BB BF
    utf16leBOM = bomType(2) // bytes FF FE
    utf16beBOM = bomType(3) // bytes FE FF
    utf32leBOM = bomType(4) // bytes FF FE 00 00
    utf32beBOM = bomType(5) // bytes 00 00 FE FF
)

// bomLegend has the string-equivalents of the `bomType` constants: since
// it's indexed using those constants, its items must follow their order
var bomLegend = []string{
    ``,
    `UTF-8`,
    `UTF-16 LE`,
    `UTF-16 BE`,
    `UTF-32 LE`,
    `UTF-32 BE`,
}
 391 
// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
    // bytes counts all bytes read
    bytes int

    // lines counts lines, and is 0 only when the byte-count is also 0: a
    // last line with no line-feed at its end still counts as a line
    lines counter

    // runes counts utf-8 sequences, each of which can use up to 4 bytes and
    // is usually a complete symbol: `emoji` country-flags are commonly-used
    // counter-examples, as each of these `symbols` needs 2 runes of 4 bytes
    // each, using 8 bytes in total; the count assumes input is valid UTF-8
    runes counter

    // maxWidth is maximum byte-width of lines, excluding carriage-returns
    // and/or line-feeds; it's not among the values func showResult shows
    maxWidth counter

    // nulls counts all-bits-off bytes
    nulls counter

    // fulls counts all-bits-on bytes
    fulls counter

    // highs counts bytes with their `top` (highest-order) bit on
    highs counter

    // spaces counts ASCII spaces
    spaces counter

    // tabs counts ASCII tabs
    tabs counter

    // trailing counts lines with trailing spaces in them
    trailing counter

    // lf counts ASCII line-feeds as their own byte-values: this means its
    // value will always be at least the same as field `crlf`
    lf counter

    // crlf counts ASCII CRLF byte-pairs
    crlf counter

    // the type of byte-order mark detected
    bom bomType

    // name is the filepath of the file/source these stats are about
    name string

    // result keeps track of whether results are valid and/or ready
    result statResult
}
 444 
 445 // updateStats does what it says, reading everything from a reader
 446 func (res *stats) updateStats(r io.Reader) error {
 447     err := res.updateUsing(r)
 448     if err == io.EOF {
 449         err = nil
 450     }
 451 
 452     if err == nil {
 453         res.result = resultSuccess
 454     } else {
 455         res.result = resultError
 456     }
 457     return err
 458 }
 459 
 460 func checkBOM(data []byte) bomType {
 461     d := data
 462     l := len(data)
 463 
 464     if l >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf {
 465         return utf8BOM
 466     }
 467     if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
 468         return utf32leBOM
 469     }
 470     if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
 471         return utf32beBOM
 472     }
 473     if l >= 2 && data[0] == 0xff && data[1] == 0xfe {
 474         return utf16leBOM
 475     }
 476     if l >= 2 && data[0] == 0xfe && data[1] == 0xff {
 477         return utf16beBOM
 478     }
 479 
 480     return noBOM
 481 }
 482 
 483 // updateUsing helps func updateStats do its job
 484 func (res *stats) updateUsing(r io.Reader) error {
 485     var buf [32 * 1024]byte
 486     var tallies [256]uint64
 487 
 488     var width counter
 489     var prev1, prev2 byte
 490 
 491     for {
 492         n, err := r.Read(buf[:])
 493         if n < 1 {
 494             res.lines = counter(tallies['\n'])
 495             res.tabs = counter(tallies['\t'])
 496             res.spaces = counter(tallies[' '])
 497             res.lf = counter(tallies['\n'])
 498             res.nulls = counter(tallies[0])
 499             res.fulls = counter(tallies[255])
 500             for i := 128; i < 256; i++ {
 501                 res.highs += counter(tallies[i])
 502             }
 503 
 504             if err == io.EOF {
 505                 return res.handleEnd(width, prev1, prev2)
 506             }
 507             return err
 508         }
 509 
 510         chunk := buf[:n]
 511         if res.bytes == 0 {
 512             res.bom = checkBOM(chunk)
 513         }
 514         res.bytes += n
 515 
 516         for _, b := range chunk {
 517             // count values without branching, because it's fun
 518             tallies[b]++
 519 
 520             // handle non-ASCII runes, assuming input is valid UTF-8
 521             res.runes += 1 - count(b&0xc0, 0x80)
 522 
 523             // handle line-feeds
 524             if b == '\n' {
 525                 crlf := count(prev1, '\r')
 526                 res.crlf += crlf
 527 
 528                 // count lines with trailing spaces, whether these end with
 529                 // a CRLF byte-pair or just a line-feed byte
 530                 if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 531                     res.trailing++
 532                 }
 533 
 534                 // exclude any CR from the current line's width-count
 535                 width -= crlf
 536                 if res.maxWidth < width {
 537                     res.maxWidth = width
 538                 }
 539 
 540                 prev2 = prev1
 541                 prev1 = b
 542                 width = 0
 543                 continue
 544             }
 545 
 546             prev2 = prev1
 547             prev1 = b
 548             width++
 549         }
 550     }
 551 }
 552 
 553 // handleEnd fixes/finalizes stats when input data end; this func is only
 554 // meant to be used by func updateStats, since it takes some of the latter's
 555 // local variables
 556 func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
 557     if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 558         res.trailing++
 559     }
 560 
 561     if res.maxWidth < width {
 562         res.maxWidth = width
 563     }
 564 
 565     // avoid reporting 0 lines with a non-0 byte-count: this is unlike the
 566     // standard cmd-line tool `wc`
 567     if res.bytes > 0 && prev1 != '\n' {
 568         res.lines++
 569     }
 570 
 571     return nil
 572 }
 573 
 574 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 575 // be added directly/branchlessly to totals
 576 func count(x, y byte) counter {
 577     if x != y {
 578         return 0
 579     }
 580     return 1
 581 }
 582 
 583 // countLeadingReady finds how many items are ready to show at the start of a
 584 // results-slice, which ensures output matches the original item-order
 585 func countLeadingReady(values []stats) int {
 586     for i, v := range values {
 587         if v.result == resultPending {
 588             return i
 589         }
 590     }
 591     return len(values)
 592 }