File: coby.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2024 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 Single-file source-code for coby: this version has no http(s) support. Even
  27 the unit-tests from the original coby are omitted.
  28 
  29 To compile a smaller-sized command-line app, you can use the `go` command as
  30 follows:
  31 
  32 go build -ldflags "-s -w" -trimpath coby.go
  33 */
  34 
  35 package main
  36 
  37 import (
  38     "bufio"
  39     "errors"
  40     "io"
  41     "io/fs"
  42     "os"
  43     "path/filepath"
  44     "runtime"
  45     "strconv"
  46     "sync"
  47 )
  48 
// info is the help message: func main shows it on standard error, skipping
// its leading line-feed, when the first cmd-line argument is a help option
const info = `
coby [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - how many lines have trailing spaces
    - how many lines end with a CRLF pair
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data start with

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them.
`
  70 
  71 // header is the first output line
  72 var header = []string{
  73     `name`,
  74     `bytes`,
  75     `runes`,
  76     `lines`,
  77     `lf`,
  78     `crlf`,
  79     `spaces`,
  80     `tabs`,
  81     `trails`,
  82     `nulls`,
  83     `fulls`,
  84     `highs`,
  85     `bom`,
  86 }
  87 
// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully: worker tasks send
// these values over a channel to func handleOutput
type event struct {
    // Index points to the task's entry in the results-slice, which follows
    // the order of the names/filepaths given
    Index int

    // Stats has all the byte-related stats
    Stats stats

    // Err is the completed task's error, or nil when the task succeeded
    Err error
}
 100 
 101 func main() {
 102     if len(os.Args) > 1 {
 103         switch os.Args[1] {
 104         case `-h`, `--h`, `-help`, `--help`:
 105             os.Stderr.WriteString(info[1:])
 106             return
 107         }
 108     }
 109 
 110     // show first/heading line right away, to let users know things are
 111     // happening
 112     for i, s := range header {
 113         if i > 0 {
 114             os.Stdout.WriteString("\t")
 115         }
 116         os.Stdout.WriteString(s)
 117     }
 118     // assume an error means later stages/apps in a pipe had enough input and
 119     // quit successfully, so quit successfully too
 120     _, err := os.Stdout.WriteString("\n")
 121     if err != nil {
 122         return
 123     }
 124 
 125     // names has all filepaths given, ignoring repetitions
 126     names, ok := findAllFiles(unique(os.Args[1:]))
 127     if !ok {
 128         os.Exit(1)
 129     }
 130     if len(names) == 0 {
 131         names = []string{`-`}
 132     }
 133 
 134     events := make(chan event)
 135     go handleInputs(names, events)
 136     if !handleOutput(os.Stdout, len(names), events) {
 137         os.Exit(1)
 138     }
 139 }
 140 
 141 // handleInputs launches all the tasks which do the actual work, limiting how
 142 // many inputs are being worked on at the same time
 143 func handleInputs(names []string, events chan event) {
 144     // allow output-reporter task to end, and thus the app
 145     defer close(events)
 146 
 147     // permissions limits how many worker tasks can be active at the same
 148     // time: when given many filepaths to work on, rate-limiting avoids
 149     // a massive number of concurrent tasks which read and process input
 150     permissions := make(chan struct{}, runtime.NumCPU())
 151     defer close(permissions)
 152 
 153     var inputs sync.WaitGroup
 154     for i := range names {
 155         // wait until some concurrency-room is available
 156         permissions <- struct{}{}
 157         inputs.Add(1)
 158 
 159         go func(i int) {
 160             defer inputs.Done()
 161             res, err := handleInput(names[i])
 162             events <- event{i, res, err}
 163             <-permissions
 164         }(i)
 165     }
 166 
 167     // wait for all inputs, before closing the `events` channel
 168     inputs.Wait()
 169 }
 170 
 171 // handleInput handles each work-item for func handleInputs
 172 func handleInput(path string) (stats, error) {
 173     var res stats
 174     res.name = path
 175 
 176     if path == `-` {
 177         err := res.updateStats(os.Stdin)
 178         return res, err
 179     }
 180 
 181     f, err := os.Open(path)
 182     if err != nil {
 183         res.result = resultError
 184         // on windows, file-not-found error messages may mention `CreateFile`,
 185         // even when trying to open files in read-only mode
 186         return res, errors.New(`can't open file named ` + path)
 187     }
 188     defer f.Close()
 189 
 190     err = res.updateStats(f)
 191     return res, err
 192 }
 193 
 194 // handleOutput asynchronously updates output as results are known, whether
 195 // it's errors or successful results; returns whether it succeeded, which
 196 // means no errors happened
 197 func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
 198     ok = true
 199     bw := bufio.NewWriter(w)
 200     defer bw.Flush()
 201 
 202     results := make([]stats, rescount)
 203 
 204     // keep track of which tasks are over, so that on each event all leading
 205     // results which are ready are shown: all of this ensures prompt output
 206     // updates as soon as results come in, while keeping the original order
 207     // of the names/filepaths given
 208     resultsLeft := results
 209 
 210     for v := range events {
 211         results[v.Index] = v.Stats
 212         if v.Err != nil {
 213             ok = false
 214             bw.Flush()
 215             showError(v.Err)
 216 
 217             // stay in the current loop, in case this failure was keeping
 218             // previous successes from showing up
 219         }
 220 
 221         n := countLeadingReady(resultsLeft)
 222 
 223         for _, res := range resultsLeft[:n] {
 224             if err := showResult(bw, res); err != nil {
 225                 // assume later stages/apps in a pipe had enough input and
 226                 // quit successfully, so quit successfully too
 227                 return true
 228             }
 229         }
 230         resultsLeft = resultsLeft[n:]
 231 
 232         // flush output-buffer only if anything new was shown
 233         if n > 0 {
 234             bw.Flush()
 235         }
 236     }
 237 
 238     return ok
 239 }
 240 
 241 // showError standardizes how errors from this app look
 242 func showError(err error) {
 243     os.Stderr.WriteString("\x1b[31m")
 244     os.Stderr.WriteString(err.Error())
 245     os.Stderr.WriteString("\x1b[0m\n")
 246 }
 247 
 248 // showResult does what it says
 249 func showResult(w *bufio.Writer, res stats) error {
 250     if res.result == resultError {
 251         return nil
 252     }
 253 
 254     var buf [64]byte
 255     w.WriteString(res.name)
 256     w.Write([]byte{'\t'})
 257     w.Write(strconv.AppendUint(buf[:0], uint64(res.bytes), 10))
 258     w.Write([]byte{'\t'})
 259     w.Write(strconv.AppendUint(buf[:0], uint64(res.runes), 10))
 260     w.Write([]byte{'\t'})
 261     w.Write(strconv.AppendUint(buf[:0], uint64(res.lines), 10))
 262     w.Write([]byte{'\t'})
 263     w.Write(strconv.AppendUint(buf[:0], uint64(res.lf), 10))
 264     w.Write([]byte{'\t'})
 265     w.Write(strconv.AppendUint(buf[:0], uint64(res.crlf), 10))
 266     w.Write([]byte{'\t'})
 267     w.Write(strconv.AppendUint(buf[:0], uint64(res.spaces), 10))
 268     w.Write([]byte{'\t'})
 269     w.Write(strconv.AppendUint(buf[:0], uint64(res.tabs), 10))
 270     w.Write([]byte{'\t'})
 271     w.Write(strconv.AppendUint(buf[:0], uint64(res.trailing), 10))
 272     w.Write([]byte{'\t'})
 273     w.Write(strconv.AppendUint(buf[:0], uint64(res.nulls), 10))
 274     w.Write([]byte{'\t'})
 275     w.Write(strconv.AppendUint(buf[:0], uint64(res.fulls), 10))
 276     w.Write([]byte{'\t'})
 277     w.Write(strconv.AppendUint(buf[:0], uint64(res.highs), 10))
 278     w.Write([]byte{'\t'})
 279     w.WriteString(bomLegend[res.bom])
 280     _, err := w.Write([]byte{'\n'})
 281     return err
 282 }
 283 
 284 // unique ensures items only appear once in the result, keeping the original
 285 // slice unchanged
 286 func unique(src []string) []string {
 287     var unique []string
 288     got := make(map[string]struct{})
 289     for _, s := range src {
 290         if _, ok := got[s]; ok {
 291             continue
 292         }
 293         unique = append(unique, s)
 294         got[s] = struct{}{}
 295     }
 296     return unique
 297 }
 298 
 299 // findAllFiles does what it says, given a mix of file/folder paths, finding
 300 // all files recursively in the case of folders
 301 func findAllFiles(paths []string) (found []string, ok bool) {
 302     var unique []string
 303     got := make(map[string]struct{})
 304     ok = true
 305 
 306     for _, root := range paths {
 307         // a dash means standard input
 308         if root == `-` {
 309             if _, ok := got[root]; ok {
 310                 continue
 311             }
 312 
 313             unique = append(unique, root)
 314             got[root] = struct{}{}
 315             continue
 316         }
 317 
 318         _, err := os.Stat(root)
 319         if os.IsNotExist(err) {
 320             ok = false
 321             // on windows, file-not-found error messages may mention `CreateFile`,
 322             // even when trying to open files in read-only mode
 323             err := errors.New(`can't find file/folder named ` + root)
 324             showError(err)
 325             continue
 326         }
 327 
 328         err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
 329             if err != nil {
 330                 return err
 331             }
 332 
 333             if d.IsDir() {
 334                 return nil
 335             }
 336 
 337             if _, ok := got[path]; ok {
 338                 return nil
 339             }
 340 
 341             unique = append(unique, path)
 342             got[path] = struct{}{}
 343             return nil
 344         })
 345 
 346         if err != nil {
 347             ok = false
 348             showError(err)
 349         }
 350     }
 351 
 352     return unique, ok
 353 }
 354 
// isZero enables branchless-counting, when xor-compared bytes are used
// as indices for it: only its first item (index 0) is 1, while all others
// are 0, which means isZero[x^y] is 1 only when bytes x and y are the same
var isZero = [256]byte{1}
 358 
// counter makes it easy to change the int-size of almost all counters: it's
// the type of all count-fields in type stats, except for its byte-count
type counter int
 361 
 362 // statResult constrains possible result-states/values in type stats
 363 type statResult int
 364 
 365 const (
 366     // resultPending is the default not-yet-ready result-status
 367     resultPending = statResult(0)
 368 
 369     // resultError signals result should show as an error, instead of data
 370     resultError = statResult(1)
 371 
 372     // resultSuccess means result can be shown
 373     resultSuccess = statResult(2)
 374 )
 375 
 376 type bomType int
 377 
 378 const (
 379     noBOM      = bomType(0)
 380     utf8BOM    = bomType(1)
 381     utf16leBOM = bomType(2)
 382     utf16beBOM = bomType(3)
 383     utf32leBOM = bomType(4)
 384     utf32beBOM = bomType(5)
 385 )
 386 
 387 var bomLegend = []string{
 388     ``,
 389     `UTF-8`,
 390     `UTF-16 LE`,
 391     `UTF-16 BE`,
 392     `UTF-32 LE`,
 393     `UTF-32 BE`,
 394 }
 395 
// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
    // bytes counts all bytes read
    bytes int

    // lines counts lines, and is 0 only when the byte-count is also 0:
    // trailing data not ending with a line-feed count as a line
    lines counter

    // runes counts utf-8 sequences, each of which can use up to 4 bytes and
    // is usually a complete symbol: `emoji` country-flags are commonly-used
    // counter-examples, as each of these `symbols` needs 2 runes, using 8
    // bytes in total; the count assumes the input is valid UTF-8, since it
    // simply counts all bytes which aren't UTF-8 continuation bytes
    runes counter

    // maxWidth is maximum byte-width of lines, excluding line-feeds and
    // the carriage-returns from CRLF byte-pairs
    maxWidth counter

    // nulls counts all-bits-off bytes
    nulls counter

    // fulls counts all-bits-on bytes
    fulls counter

    // highs counts bytes with their `top` (highest-order) bit on
    highs counter

    // spaces counts ASCII spaces
    spaces counter

    // tabs counts ASCII tabs
    tabs counter

    // trailing counts lines with trailing spaces in them
    trailing counter

    // lf counts ASCII line-feeds as their own byte-values: this means its
    // value will always be at least the same as field `crlf`
    lf counter

    // crlf counts ASCII CRLF byte-pairs
    crlf counter

    // bom is the type of byte-order mark detected, if any
    bom bomType

    // name is the filepath of the file/source these stats are about
    name string

    // result keeps track of whether results are valid and/or ready
    result statResult
}
 448 
 449 // updateStats does what it says, reading everything from a reader
 450 func (res *stats) updateStats(r io.Reader) error {
 451     err := res.updateUsing(r)
 452     if err == io.EOF {
 453         err = nil
 454     }
 455 
 456     if err == nil {
 457         res.result = resultSuccess
 458     } else {
 459         res.result = resultError
 460     }
 461     return err
 462 }
 463 
 464 func checkBOM(data []byte) bomType {
 465     d := data
 466     l := len(data)
 467 
 468     if l >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf {
 469         return utf8BOM
 470     }
 471     if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
 472         return utf32leBOM
 473     }
 474     if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
 475         return utf32beBOM
 476     }
 477     if l >= 2 && data[0] == 0xff && data[1] == 0xfe {
 478         return utf16leBOM
 479     }
 480     if l >= 2 && data[0] == 0xfe && data[1] == 0xff {
 481         return utf16beBOM
 482     }
 483 
 484     return noBOM
 485 }
 486 
 487 // updateUsing helps func updateStats do its job
 488 func (res *stats) updateUsing(r io.Reader) error {
 489     var width counter
 490     var prev1, prev2 byte
 491     var buf [16 * 1024]byte
 492     var tallies [256]uint64
 493 
 494     for {
 495         n, err := r.Read(buf[:])
 496         if n < 1 {
 497             if err == io.EOF {
 498                 res.lines = counter(tallies['\n'])
 499                 res.tabs = counter(tallies['\t'])
 500                 res.spaces = counter(tallies[' '])
 501                 res.lf = counter(tallies['\n'])
 502                 res.nulls = counter(tallies[0])
 503                 res.fulls = counter(tallies[255])
 504                 for i := 128; i < 256; i++ {
 505                     res.highs += counter(tallies[i])
 506                 }
 507                 return res.handleEnd(width, prev1, prev2)
 508             }
 509             return err
 510         }
 511 
 512         chunk := buf[:n]
 513         if res.bytes == 0 {
 514             res.bom = checkBOM(chunk)
 515         }
 516         res.bytes += n
 517 
 518         for _, b := range chunk {
 519             // count values without branching, because it's fun
 520             tallies[b]++
 521 
 522             // handle non-ASCII runes, assuming input is valid UTF-8
 523             res.runes += 1 - count(b&0xc0, 0x80)
 524 
 525             // handle line-feeds
 526             if b == '\n' {
 527                 crlf := count(prev1, '\r')
 528                 res.crlf += crlf
 529 
 530                 // count lines with trailing spaces, whether these end with
 531                 // a CRLF byte-pair or just a line-feed byte
 532                 if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 533                     res.trailing++
 534                 }
 535 
 536                 // exclude any CR from the current line's width-count
 537                 width -= crlf
 538                 if res.maxWidth < width {
 539                     res.maxWidth = width
 540                 }
 541 
 542                 prev2 = prev1
 543                 prev1 = b
 544                 width = 0
 545                 continue
 546             }
 547 
 548             prev2 = prev1
 549             prev1 = b
 550             width++
 551         }
 552     }
 553 }
 554 
 555 // handleEnd fixes/finalizes stats when input data end; this func is only
 556 // meant to be used by func updateStats, since it takes some of the latter's
 557 // local variables
 558 func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
 559     if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 560         res.trailing++
 561     }
 562 
 563     if res.maxWidth < width {
 564         res.maxWidth = width
 565     }
 566 
 567     // avoid reporting 0 lines with a non-0 byte-count: this is unlike the
 568     // standard cmd-line tool `wc`
 569     if res.bytes > 0 && prev1 != '\n' {
 570         res.lines++
 571     }
 572 
 573     return nil
 574 }
 575 
// disabled alternative to func count: this table-lookup variant uses isZero
// to check if 2 bytes are the same without branching; it's kept here only
// for reference, since the func count actually in use is defined below
// func count(x, y byte) counter {
//  return counter(isZero[x^y])
// }
 581 
 582 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 583 // be added directly/branchlessly to totals
 584 func count(x, y byte) counter {
 585     if x != y {
 586         return 0
 587     }
 588     return 1
 589 }
 590 
 591 // countLeadingReady finds how many items are ready to show at the start of a
 592 // results-slice, which ensures output matches the original item-order
 593 func countLeadingReady(values []stats) int {
 594     for i, v := range values {
 595         if v.result == resultPending {
 596             return i
 597         }
 598     }
 599     return len(values)
 600 }