File: coby.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 Single-file source-code for coby: this version has no http(s) support. Even
  27 the unit-tests from the original coby are omitted.
  28 
  29 To compile a smaller-sized command-line app, you can use the `go` command as
  30 follows:
  31 
  32 go build -ldflags "-s -w" -trimpath coby.go
  33 */
  34 
  35 package main
  36 
  37 import (
  38     "bufio"
  39     "errors"
  40     "io"
  41     "io/fs"
  42     "os"
  43     "path/filepath"
  44     "runtime"
  45     "strconv"
  46     "sync"
  47 )
  48 
  49 const info = `
  50 coby [files/folders...]
  51 
  52 
  53 COunt BYtes finds out some simple byte-related stats, counting
  54 
  55     - bytes
  56     - lines
  57     - how many lines have trailing spaces
  58     - how many lines end with a CRLF pair
  59     - all-off (0) bytes
  60     - all-on (255) bytes
  61     - high-bytes (128+)
  62     - which (if any) byte-order mark the data start with
  63 
  64 The output is TSV (tab-separated values) lines, where the first line has
  65 all the column names.
  66 
  67 When no filepaths are given, the standard input is used by default. All
  68 folder names given expand recursively into all filenames in them.
  69 `
  70 
  71 // header is the first output line
  72 var header = []string{
  73     `name`,
  74     `bytes`,
  75     `lines`,
  76     `lf`,
  77     `crlf`,
  78     `spaces`,
  79     `tabs`,
  80     `trails`,
  81     `nulls`,
  82     `fulls`,
  83     `highs`,
  84     `bom`,
  85 }
  86 
// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully; values of this type
// are sent by the worker tasks started in func handleInputs, and are received
// by func handleOutput
type event struct {
    // Index points to the task's entry in the results-slice, which follows
    // the order of the names/filepaths given
    Index int

    // Stats has all the byte-related stats
    Stats stats

    // Err is the completed task's error, or lack of
    Err error
}
  99 
 100 func main() {
 101     if len(os.Args) > 1 {
 102         switch os.Args[1] {
 103         case `-h`, `--h`, `-help`, `--help`:
 104             os.Stderr.WriteString(info[1:])
 105             return
 106         }
 107     }
 108 
 109     // show first/heading line right away, to let users know things are
 110     // happening
 111     for i, s := range header {
 112         if i > 0 {
 113             os.Stdout.WriteString("\t")
 114         }
 115         os.Stdout.WriteString(s)
 116     }
 117     // assume an error means later stages/apps in a pipe had enough input and
 118     // quit successfully, so quit successfully too
 119     _, err := os.Stdout.WriteString("\n")
 120     if err != nil {
 121         return
 122     }
 123 
 124     // names has all filepaths given, ignoring repetitions
 125     names, ok := findAllFiles(unique(os.Args[1:]))
 126     if !ok {
 127         os.Exit(1)
 128     }
 129     if len(names) == 0 {
 130         names = []string{`-`}
 131     }
 132 
 133     events := make(chan event)
 134     go handleInputs(names, events)
 135     if !handleOutput(os.Stdout, len(names), events) {
 136         os.Exit(1)
 137     }
 138 }
 139 
 140 // handleInputs launches all the tasks which do the actual work, limiting how
 141 // many inputs are being worked on at the same time
 142 func handleInputs(names []string, events chan event) {
 143     // allow output-reporter task to end, and thus the app
 144     defer close(events)
 145 
 146     // permissions limits how many worker tasks can be active at the same
 147     // time: when given many filepaths to work on, rate-limiting avoids
 148     // a massive number of concurrent tasks which read and process input
 149     permissions := make(chan struct{}, runtime.NumCPU())
 150     defer close(permissions)
 151 
 152     var inputs sync.WaitGroup
 153     for i := range names {
 154         // wait until some concurrency-room is available
 155         permissions <- struct{}{}
 156         inputs.Add(1)
 157 
 158         go func(i int) {
 159             defer inputs.Done()
 160             res, err := handleInput(names[i])
 161             events <- event{i, res, err}
 162             <-permissions
 163         }(i)
 164     }
 165 
 166     // wait for all inputs, before closing the `events` channel
 167     inputs.Wait()
 168 }
 169 
 170 // handleInput handles each work-item for func handleInputs
 171 func handleInput(path string) (stats, error) {
 172     var res stats
 173     res.name = path
 174 
 175     if path == `-` {
 176         err := res.updateStats(os.Stdin)
 177         return res, err
 178     }
 179 
 180     f, err := os.Open(path)
 181     if err != nil {
 182         res.result = resultError
 183         // on windows, file-not-found error messages may mention `CreateFile`,
 184         // even when trying to open files in read-only mode
 185         return res, errors.New(`can't open file named ` + path)
 186     }
 187     defer f.Close()
 188 
 189     err = res.updateStats(f)
 190     return res, err
 191 }
 192 
 193 // handleOutput asynchronously updates output as results are known, whether
 194 // it's errors or successful results; returns whether it succeeded, which
 195 // means no errors happened
 196 func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
 197     ok = true
 198     bw := bufio.NewWriter(w)
 199     defer bw.Flush()
 200 
 201     results := make([]stats, rescount)
 202 
 203     // keep track of which tasks are over, so that on each event all leading
 204     // results which are ready are shown: all of this ensures prompt output
 205     // updates as soon as results come in, while keeping the original order
 206     // of the names/filepaths given
 207     resultsLeft := results
 208 
 209     for v := range events {
 210         results[v.Index] = v.Stats
 211         if v.Err != nil {
 212             ok = false
 213             bw.Flush()
 214             showError(v.Err)
 215 
 216             // stay in the current loop, in case this failure was keeping
 217             // previous successes from showing up
 218         }
 219 
 220         n := countLeadingReady(resultsLeft)
 221 
 222         for _, res := range resultsLeft[:n] {
 223             if err := showResult(bw, res); err != nil {
 224                 // assume later stages/apps in a pipe had enough input and
 225                 // quit successfully, so quit successfully too
 226                 return true
 227             }
 228         }
 229         resultsLeft = resultsLeft[n:]
 230 
 231         // flush output-buffer only if anything new was shown
 232         if n > 0 {
 233             bw.Flush()
 234         }
 235     }
 236 
 237     return ok
 238 }
 239 
 240 // showError standardizes how errors from this app look
 241 func showError(err error) {
 242     os.Stderr.WriteString("\x1b[31m")
 243     os.Stderr.WriteString(err.Error())
 244     os.Stderr.WriteString("\x1b[0m\n")
 245 }
 246 
 247 // showResult does what it says
 248 func showResult(w *bufio.Writer, res stats) error {
 249     if res.result == resultError {
 250         return nil
 251     }
 252 
 253     var buf [64]byte
 254     w.WriteString(res.name)
 255     w.WriteByte('\t')
 256     w.Write(strconv.AppendUint(buf[:0], uint64(res.bytes), 10))
 257     w.WriteByte('\t')
 258     w.Write(strconv.AppendUint(buf[:0], uint64(res.lines), 10))
 259     w.WriteByte('\t')
 260     w.Write(strconv.AppendUint(buf[:0], uint64(res.lf), 10))
 261     w.WriteByte('\t')
 262     w.Write(strconv.AppendUint(buf[:0], uint64(res.crlf), 10))
 263     w.WriteByte('\t')
 264     w.Write(strconv.AppendUint(buf[:0], uint64(res.spaces), 10))
 265     w.WriteByte('\t')
 266     w.Write(strconv.AppendUint(buf[:0], uint64(res.tabs), 10))
 267     w.WriteByte('\t')
 268     w.Write(strconv.AppendUint(buf[:0], uint64(res.trailing), 10))
 269     w.WriteByte('\t')
 270     w.Write(strconv.AppendUint(buf[:0], uint64(res.nulls), 10))
 271     w.WriteByte('\t')
 272     w.Write(strconv.AppendUint(buf[:0], uint64(res.fulls), 10))
 273     w.WriteByte('\t')
 274     w.Write(strconv.AppendUint(buf[:0], uint64(res.highs), 10))
 275     w.WriteByte('\t')
 276     w.WriteString(bomLegend[res.bom])
 277     return w.WriteByte('\n')
 278 }
 279 
 280 // unique ensures items only appear once in the result, keeping the original
 281 // slice unchanged
 282 func unique(src []string) []string {
 283     var unique []string
 284     got := make(map[string]struct{})
 285     for _, s := range src {
 286         if _, ok := got[s]; ok {
 287             continue
 288         }
 289         unique = append(unique, s)
 290         got[s] = struct{}{}
 291     }
 292     return unique
 293 }
 294 
 295 // findAllFiles does what it says, given a mix of file/folder paths, finding
 296 // all files recursively in the case of folders
 297 func findAllFiles(paths []string) (found []string, ok bool) {
 298     var unique []string
 299     got := make(map[string]struct{})
 300     ok = true
 301 
 302     for _, root := range paths {
 303         // a dash means standard input
 304         if root == `-` {
 305             if _, ok := got[root]; ok {
 306                 continue
 307             }
 308 
 309             unique = append(unique, root)
 310             got[root] = struct{}{}
 311             continue
 312         }
 313 
 314         _, err := os.Stat(root)
 315         if os.IsNotExist(err) {
 316             ok = false
 317             // on windows, file-not-found error messages may mention `CreateFile`,
 318             // even when trying to open files in read-only mode
 319             err := errors.New(`can't find file/folder named ` + root)
 320             showError(err)
 321             continue
 322         }
 323 
 324         err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
 325             if err != nil {
 326                 return err
 327             }
 328 
 329             if d.IsDir() {
 330                 return nil
 331             }
 332 
 333             if _, ok := got[path]; ok {
 334                 return nil
 335             }
 336 
 337             unique = append(unique, path)
 338             got[path] = struct{}{}
 339             return nil
 340         })
 341 
 342         if err != nil {
 343             ok = false
 344             showError(err)
 345         }
 346     }
 347 
 348     return unique, ok
 349 }
 350 
// counter makes it easy to change the int-size of almost all counters: the
// exception is field `bytes` in type stats, which is declared as a plain int
type counter int
 353 
 354 // statResult constrains possible result-states/values in type stats
 355 type statResult int
 356 
 357 const (
 358     // resultPending is the default not-yet-ready result-status
 359     resultPending = statResult(0)
 360 
 361     // resultError means result should show as an error, instead of data
 362     resultError = statResult(1)
 363 
 364     // resultSuccess means result can be shown
 365     resultSuccess = statResult(2)
 366 )
 367 
 368 type bomType int
 369 
 370 const (
 371     noBOM      = bomType(0)
 372     utf8BOM    = bomType(1)
 373     utf16leBOM = bomType(2)
 374     utf16beBOM = bomType(3)
 375     utf32leBOM = bomType(4)
 376     utf32beBOM = bomType(5)
 377 )
 378 
 379 // bomLegend has the string-equivalents of the `bomType` constants
 380 var bomLegend = []string{
 381     ``,
 382     `UTF-8`,
 383     `UTF-16 LE`,
 384     `UTF-16 BE`,
 385     `UTF-32 LE`,
 386     `UTF-32 BE`,
 387 }
 388 
// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
    // bytes counts all bytes read
    bytes int

    // lines counts lines, and is 0 only when the byte-count is also 0
    lines counter

    // maxWidth is maximum byte-width of lines, excluding carriage-returns
    // and/or line-feeds; func showResult doesn't emit this field
    maxWidth counter

    // nulls counts all-bits-off bytes
    nulls counter

    // fulls counts all-bits-on bytes
    fulls counter

    // highs counts bytes with their `top` (highest-order) bit on
    highs counter

    // spaces counts ASCII spaces
    spaces counter

    // tabs counts ASCII tabs
    tabs counter

    // trailing counts lines with trailing spaces in them
    trailing counter

    // lf counts ASCII line-feeds as their own byte-values: this means its
    // value will always be at least the same as field `crlf`
    lf counter

    // crlf counts ASCII CRLF byte-pairs
    crlf counter

    // bom is the type of byte-order mark detected
    bom bomType

    // name is the filepath of the file/source these stats are about
    name string

    // result keeps track of whether results are valid and/or ready: its
    // zero value is resultPending
    result statResult
}
 436 
 437 // updateStats does what it says, reading everything from a reader
 438 func (res *stats) updateStats(r io.Reader) error {
 439     err := res.updateUsing(r)
 440     if err == io.EOF {
 441         err = nil
 442     }
 443 
 444     if err == nil {
 445         res.result = resultSuccess
 446     } else {
 447         res.result = resultError
 448     }
 449     return err
 450 }
 451 
 452 func checkBOM(data []byte) bomType {
 453     d := data
 454     l := len(data)
 455 
 456     if l >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf {
 457         return utf8BOM
 458     }
 459     if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
 460         return utf32leBOM
 461     }
 462     if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
 463         return utf32beBOM
 464     }
 465     if l >= 2 && data[0] == 0xff && data[1] == 0xfe {
 466         return utf16leBOM
 467     }
 468     if l >= 2 && data[0] == 0xfe && data[1] == 0xff {
 469         return utf16beBOM
 470     }
 471 
 472     return noBOM
 473 }
 474 
 475 // updateUsing helps func updateStats do its job
 476 func (res *stats) updateUsing(r io.Reader) error {
 477     var buf [32 * 1024]byte
 478     var tallies [256]uint64
 479 
 480     var width counter
 481     var prev1, prev2 byte
 482 
 483     for {
 484         n, err := r.Read(buf[:])
 485         if n < 1 {
 486             res.lines = counter(tallies['\n'])
 487             res.tabs = counter(tallies['\t'])
 488             res.spaces = counter(tallies[' '])
 489             res.lf = counter(tallies['\n'])
 490             res.nulls = counter(tallies[0])
 491             res.fulls = counter(tallies[255])
 492             for i := 128; i < 256; i++ {
 493                 res.highs += counter(tallies[i])
 494             }
 495 
 496             if err == io.EOF {
 497                 return res.handleEnd(width, prev1, prev2)
 498             }
 499             return err
 500         }
 501 
 502         chunk := buf[:n]
 503         if res.bytes == 0 {
 504             res.bom = checkBOM(chunk)
 505         }
 506         res.bytes += n
 507 
 508         for _, b := range chunk {
 509             // count values without branching, because it's fun
 510             tallies[b]++
 511 
 512             // handle line-feeds
 513             if b == '\n' {
 514                 crlf := count(prev1, '\r')
 515                 res.crlf += crlf
 516 
 517                 // count lines with trailing spaces, whether these end with
 518                 // a CRLF byte-pair or just a line-feed byte
 519                 if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 520                     res.trailing++
 521                 }
 522 
 523                 // exclude any CR from the current line's width-count
 524                 width -= crlf
 525                 if res.maxWidth < width {
 526                     res.maxWidth = width
 527                 }
 528 
 529                 prev2 = prev1
 530                 prev1 = b
 531                 width = 0
 532                 continue
 533             }
 534 
 535             prev2 = prev1
 536             prev1 = b
 537             width++
 538         }
 539     }
 540 }
 541 
 542 // handleEnd fixes/finalizes stats when input data end; this func is only
 543 // meant to be used by func updateStats, since it takes some of the latter's
 544 // local variables
 545 func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
 546     if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 547         res.trailing++
 548     }
 549 
 550     if res.maxWidth < width {
 551         res.maxWidth = width
 552     }
 553 
 554     // avoid reporting 0 lines with a non-0 byte-count: this is unlike the
 555     // standard cmd-line tool `wc`
 556     if res.bytes > 0 && prev1 != '\n' {
 557         res.lines++
 558     }
 559 
 560     return nil
 561 }
 562 
 563 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 564 // be added directly/branchlessly to totals
 565 func count(x, y byte) counter {
 566     if x != y {
 567         return 0
 568     }
 569     return 1
 570 }
 571 
 572 // countLeadingReady finds how many items are ready to show at the start of a
 573 // results-slice, which ensures output matches the original item-order
 574 func countLeadingReady(values []stats) int {
 575     for i, v := range values {
 576         if v.result == resultPending {
 577             return i
 578         }
 579     }
 580     return len(values)
 581 }