File: coby/info.txt
   1 coby [files/folders...]
   2 
   3 COunt BYtes finds out some simple byte-related stats, counting
   4 
   5     - bytes and runes (UTF-8 sequences)
   6     - lines and line-feeds
   7     - spaces, tabs, and how many lines have trailing spaces
   8     - how many lines end with a CRLF pair
   9     - all-off (0) bytes
  10     - all-on (255) bytes
  11     - high-bytes (128+)
  12 
  13 The output is TSV (tab-separated values) lines, where the first line has
  14 all the column names.
  15 
  16 When no filepaths are given, the standard input is used by default. All
  17 folder names given expand recursively into all filenames in them.

     File: coby/main.go
   1 package main
   2 
   3 import (
   4     "bufio"
   5     "errors"
   6     "io"
   7     "io/fs"
   8     "os"
   9     "path/filepath"
  10     "runtime"
  11     "strconv"
  12     "sync"
  13 
  14     _ "embed"
  15 )
  16 
  17 // Note: the code is avoiding using the fmt package to save hundreds of
  18 // kilobytes on the resulting executable, which is a noticeable difference.
  19 
  20 //go:embed info.txt
  21 var info string
  22 
  23 // header is the first output line
  24 var header = []string{
  25     `name`,
  26     `bytes`,
  27     `runes`,
  28     `lines`,
  29     `lf`,
  30     `crlf`,
  31     `spaces`,
  32     `tabs`,
  33     `trails`,
  34     `nulls`,
  35     `fulls`,
  36     `highs`,
  37 }
  38 
// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully; values of this type
// travel from the worker tasks to func handleOutput via a channel
//
// Note: func handleInputs builds these values positionally, so the order of
// the fields below matters.
type event struct {
	// Index points to the task's entry in the results-slice, which is also
	// the position of the task's name among all the names given
	Index int

	// Stats has all the byte-related stats
	Stats stats

	// Err is the completed task's error, or nil when the task succeeded
	Err error
}
  51 
  52 func main() {
  53     if len(os.Args) > 1 {
  54         switch os.Args[1] {
  55         case `-h`, `--h`, `-help`, `--help`:
  56             os.Stderr.WriteString(info)
  57             return
  58         }
  59     }
  60 
  61     // show first/heading line right away, to let users know things are
  62     // happening
  63     for i, s := range header {
  64         if i > 0 {
  65             os.Stdout.WriteString("\t")
  66         }
  67         os.Stdout.WriteString(s)
  68     }
  69     // assume an error means later stages/apps in a pipe had enough input and
  70     // quit successfully, so quit successfully too
  71     _, err := os.Stdout.WriteString("\n")
  72     if err != nil {
  73         return
  74     }
  75 
  76     // names has all filepaths given, ignoring repetitions
  77     names, ok := findAllFiles(unique(os.Args[1:]))
  78     if !ok {
  79         os.Exit(1)
  80     }
  81     if len(names) == 0 {
  82         names = []string{`-`}
  83     }
  84 
  85     events := make(chan event)
  86     go handleInputs(names, events)
  87     if !handleOutput(os.Stdout, len(names), events) {
  88         os.Exit(1)
  89     }
  90 }
  91 
  92 // handleInputs launches all the tasks which do the actual work, limiting how
  93 // many inputs are being worked on at the same time
  94 func handleInputs(names []string, events chan event) {
  95     // allow output-reporter task to end, and thus the app
  96     defer close(events)
  97 
  98     // permissions limits how many worker tasks can be active at the same
  99     // time: when given many filepaths to work on, rate-limiting avoids
 100     // a massive number of concurrent tasks which read and process input
 101     permissions := make(chan struct{}, runtime.NumCPU())
 102     defer close(permissions)
 103 
 104     var inputs sync.WaitGroup
 105     for i := range names {
 106         // wait until some concurrency-room is available
 107         permissions <- struct{}{}
 108         inputs.Add(1)
 109 
 110         go func(i int) {
 111             defer inputs.Done()
 112             res, err := handleInput(names[i])
 113             events <- event{i, res, err}
 114             <-permissions
 115         }(i)
 116     }
 117 
 118     // wait for all inputs, before closing the `events` channel
 119     inputs.Wait()
 120 }
 121 
 122 // handleInput handles each work-item for func handleInputs
 123 func handleInput(path string) (stats, error) {
 124     var res stats
 125     res.name = path
 126 
 127     if path == `-` {
 128         err := res.updateStats(os.Stdin)
 129         return res, err
 130     }
 131 
 132     f, err := os.Open(path)
 133     if err != nil {
 134         res.result = resultError
 135         // on windows, file-not-found error messages may mention `CreateFile`,
 136         // even when trying to open files in read-only mode
 137         return res, errors.New(`can't open file named ` + path)
 138     }
 139     defer f.Close()
 140 
 141     err = res.updateStats(f)
 142     return res, err
 143 }
 144 
 145 // handleOutput asynchronously updates output as results are known, whether
 146 // it's errors or successful results; returns whether it succeeded, which
 147 // means no errors happened
 148 func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
 149     bw := bufio.NewWriter(w)
 150     defer bw.Flush()
 151 
 152     results := make([]stats, rescount)
 153 
 154     // keep track of which tasks are over, so that on each event all leading
 155     // results which are ready are shown: all of this ensures prompt output
 156     // updates as soon as results come in, while keeping the original order
 157     // of the names/filepaths given
 158     resultsLeft := results
 159 
 160     for v := range events {
 161         results[v.Index] = v.Stats
 162         if v.Err != nil {
 163             ok = false
 164             bw.Flush()
 165             showError(v.Err)
 166 
 167             // stay in the current loop, in case this failure was keeping
 168             // previous successes from showing up
 169         }
 170 
 171         n := countLeadingReady(resultsLeft)
 172 
 173         for _, res := range resultsLeft[:n] {
 174             if err := showResult(bw, res); err != nil {
 175                 // assume later stages/apps in a pipe had enough input and
 176                 // quit successfully, so quit successfully too
 177                 return true
 178             }
 179         }
 180         resultsLeft = resultsLeft[n:]
 181 
 182         // flush output-buffer only if anything new was shown
 183         if n > 0 {
 184             bw.Flush()
 185         }
 186     }
 187 
 188     return ok
 189 }
 190 
 191 // showError standardizes how errors from this app look
 192 func showError(err error) {
 193     os.Stderr.WriteString("\x1b[31m")
 194     os.Stderr.WriteString(err.Error())
 195     os.Stderr.WriteString("\x1b[0m\n")
 196 }
 197 
 198 // showResult does what it says
 199 func showResult(w *bufio.Writer, res stats) error {
 200     if res.result == resultError {
 201         return nil
 202     }
 203 
 204     var buf [64]byte
 205     w.WriteString(res.name)
 206     w.Write([]byte{'\t'})
 207     w.Write(strconv.AppendUint(buf[:0], uint64(res.bytes), 10))
 208     w.Write([]byte{'\t'})
 209     w.Write(strconv.AppendUint(buf[:0], uint64(res.runes), 10))
 210     w.Write([]byte{'\t'})
 211     w.Write(strconv.AppendUint(buf[:0], uint64(res.lines), 10))
 212     w.Write([]byte{'\t'})
 213     w.Write(strconv.AppendUint(buf[:0], uint64(res.lf), 10))
 214     w.Write([]byte{'\t'})
 215     w.Write(strconv.AppendUint(buf[:0], uint64(res.crlf), 10))
 216     w.Write([]byte{'\t'})
 217     w.Write(strconv.AppendUint(buf[:0], uint64(res.spaces), 10))
 218     w.Write([]byte{'\t'})
 219     w.Write(strconv.AppendUint(buf[:0], uint64(res.tabs), 10))
 220     w.Write([]byte{'\t'})
 221     w.Write(strconv.AppendUint(buf[:0], uint64(res.trailing), 10))
 222     w.Write([]byte{'\t'})
 223     w.Write(strconv.AppendUint(buf[:0], uint64(res.nulls), 10))
 224     w.Write([]byte{'\t'})
 225     w.Write(strconv.AppendUint(buf[:0], uint64(res.fulls), 10))
 226     w.Write([]byte{'\t'})
 227     w.Write(strconv.AppendUint(buf[:0], uint64(res.highs), 10))
 228     _, err := w.Write([]byte{'\n'})
 229     return err
 230 }
 231 
 232 // unique ensures items only appear once in the result, keeping the original
 233 // slice unchanged
 234 func unique(src []string) []string {
 235     var unique []string
 236     got := make(map[string]struct{})
 237     for _, s := range src {
 238         if _, ok := got[s]; ok {
 239             continue
 240         }
 241         unique = append(unique, s)
 242         got[s] = struct{}{}
 243     }
 244     return unique
 245 }
 246 
 247 // findAllFiles does what it says, given a mix of file/folder paths, finding
 248 // all files recursively in the case of folders
 249 func findAllFiles(paths []string) (found []string, ok bool) {
 250     var unique []string
 251     got := make(map[string]struct{})
 252     ok = true
 253 
 254     for _, root := range paths {
 255         // a dash means standard input
 256         if root == `-` {
 257             if _, ok := got[root]; ok {
 258                 continue
 259             }
 260 
 261             unique = append(unique, root)
 262             got[root] = struct{}{}
 263             continue
 264         }
 265 
 266         _, err := os.Stat(root)
 267         if os.IsNotExist(err) {
 268             ok = false
 269             // on windows, file-not-found error messages may mention `CreateFile`,
 270             // even when trying to open files in read-only mode
 271             err := errors.New(`can't find file/folder named ` + root)
 272             showError(err)
 273             continue
 274         }
 275 
 276         err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
 277             if err != nil {
 278                 return err
 279             }
 280 
 281             if d.IsDir() {
 282                 return nil
 283             }
 284 
 285             if _, ok := got[path]; ok {
 286                 return nil
 287             }
 288 
 289             unique = append(unique, path)
 290             got[path] = struct{}{}
 291             return nil
 292         })
 293 
 294         if err != nil {
 295             ok = false
 296             showError(err)
 297         }
 298     }
 299 
 300     return unique, ok
 301 }

     File: coby/mit-license.txt
   1 The MIT License (MIT)
   2 
   3 Copyright © 2024 pacman64
   4 
   5 Permission is hereby granted, free of charge, to any person obtaining a copy of
   6 this software and associated documentation files (the “Software”), to deal
   7 in the Software without restriction, including without limitation the rights to
   8 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   9 of the Software, and to permit persons to whom the Software is furnished to do
  10 so, subject to the following conditions:
  11 
  12 The above copyright notice and this permission notice shall be included in all
  13 copies or substantial portions of the Software.
  14 
  15 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21 SOFTWARE.

     File: coby/stats.go
   1 package main
   2 
   3 import (
   4     "io"
   5 )
   6 
   7 // isZero enables branchless-counting, when xor-compared bytes are used
   8 // as indices for it
   9 var isZero = [256]byte{1}
  10 
// counter makes it easy to change the int-size of almost all counters: the
// exception is the byte-count in type stats, which is a plain int
type counter int
  13 
  14 // statResult constrains possible result-states/values in type stats
  15 type statResult int
  16 
  17 const (
  18     // resultPending is the default not-yet-ready result-status
  19     resultPending = statResult(0)
  20 
  21     // resultError signals result should show as an error, instead of data
  22     resultError = statResult(1)
  23 
  24     // resultSuccess means result can be shown
  25     resultSuccess = statResult(2)
  26 )
  27 
// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read: unlike the other counts, its type is a
	// plain int
	bytes int

	// lines counts lines, and is 0 only when the byte-count is also 0
	lines counter

	// runes counts utf-8 sequences, each of which can use up to 4 bytes and
	// is usually a complete symbol: `emoji` country-flags are commonly-used
	// counter-examples, as each of these `symbols` needs 2 runes, which
	// take 8 bytes in total
	runes counter

	// maxWidth is the maximum width of lines, excluding carriage-returns
	// and/or line-feeds: each ASCII byte adds 1 to a line's width, while
	// non-ASCII bytes only add to it when they also add to the rune-count
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// name is the filepath of the file/source these stats are about
	name string

	// result keeps track of whether results are valid and/or ready
	result statResult
}
  77 
  78 // updateStats does what it says, reading everything from a reader
  79 func (res *stats) updateStats(r io.Reader) error {
  80     err := res.updateUsing(r)
  81     if err == io.EOF {
  82         err = nil
  83     }
  84 
  85     if err == nil {
  86         res.result = resultSuccess
  87     } else {
  88         res.result = resultError
  89     }
  90     return err
  91 }
  92 
  93 // updateUsing helps func updateStats do its job
  94 func (res *stats) updateUsing(r io.Reader) error {
  95     var width counter
  96     var highRun int
  97     var prev1, prev2 byte
  98     var buf [16 * 1024]byte
  99     var tallies [256]uint64
 100 
 101     for {
 102         n, err := r.Read(buf[:])
 103         if n < 1 {
 104             if err == io.EOF {
 105                 res.tabs = counter(tallies['\t'])
 106                 res.spaces = counter(tallies[' '])
 107                 res.lf = counter(tallies['\n'])
 108                 res.nulls = counter(tallies[0])
 109                 res.fulls = counter(tallies[255])
 110                 for i := 128; i < 256; i++ {
 111                     res.highs += counter(tallies[i])
 112                 }
 113                 return res.handleEnd(width, prev1, highRun)
 114             }
 115             return err
 116         }
 117 
 118         res.bytes += n
 119         chunk := buf[:n]
 120 
 121         for _, b := range chunk {
 122             // count values without branching, because it's fun
 123             tallies[b]++
 124 
 125             // handle non-ASCII runes, assuming input is valid UTF-8
 126             if b >= 128 {
 127                 if highRun < 3 {
 128                     highRun++
 129                 } else {
 130                     highRun = 0
 131                     res.runes++
 132                     width++
 133                 }
 134 
 135                 prev2 = prev1
 136                 prev1 = b
 137                 continue
 138             }
 139 
 140             // handle line-feeds
 141             if b == '\n' {
 142                 res.lines++
 143 
 144                 crlf := count(prev1, '\r')
 145                 res.crlf += crlf
 146 
 147                 // count lines with trailing spaces, whether these end with
 148                 // a CRLF byte-pair or just a line-feed byte
 149                 res.trailing += count(prev1, ' ')
 150                 res.trailing += crlf & count(prev2, ' ')
 151 
 152                 // exclude any CR from the current line's width-count
 153                 width -= crlf
 154                 if res.maxWidth < width {
 155                     res.maxWidth = width
 156                 }
 157 
 158                 prev2 = prev1
 159                 prev1 = b
 160 
 161                 res.runes++
 162                 highRun = 0
 163                 width = 0
 164                 continue
 165             }
 166 
 167             prev2 = prev1
 168             prev1 = b
 169 
 170             res.runes++
 171             highRun = 0
 172             width++
 173         }
 174     }
 175 }
 176 
 177 // handleEnd fixes/finalizes stats when input data end; this func is only
 178 // meant to be used by func updateStats, since it takes some of the latter's
 179 // local variables
 180 func (res *stats) handleEnd(width counter, prev1 byte, highRun int) error {
 181     if prev1 == ' ' {
 182         res.trailing++
 183     }
 184 
 185     if res.maxWidth < width {
 186         res.maxWidth = width
 187     }
 188 
 189     // avoid reporting 0 lines with a non-0 byte-count: this is unlike the
 190     // standard cmd-line tool `wc`
 191     if res.bytes > 0 && prev1 != '\n' {
 192         res.lines++
 193     }
 194 
 195     if highRun > 0 {
 196         res.runes++
 197     }
 198     return nil
 199 }
 200 
 201 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 202 // be added directly/branchlessly to totals
 203 func count(x, y byte) counter {
 204     return counter(isZero[x^y])
 205 }
 206 
 207 // countLeadingReady finds how many items are ready to show at the start of a
 208 // results-slice, which ensures output matches the original item-order
 209 func countLeadingReady(values []stats) int {
 210     for i, v := range values {
 211         if v.result == resultPending {
 212             return i
 213         }
 214     }
 215     return len(values)
 216 }

     File: coby/stats_test.go
   1 package main
   2 
   3 import (
   4     "strings"
   5     "testing"
   6 )
   7 
   8 func TestCount(t *testing.T) {
   9     for x := 0; x < 256; x++ {
  10         for y := 0; y < 256; y++ {
  11             var exp counter
  12             if x == y {
  13                 exp = 1
  14             }
  15 
  16             if got := count(byte(x), byte(y)); got != exp {
  17                 t.Fatalf(`%d, %d: expected %v, but got %v`, x, y, exp, got)
  18                 return
  19             }
  20         }
  21     }
  22 }
  23 
  24 func TestCountLeadingReady(t *testing.T) {
  25     for size := 0; size <= 20; size++ {
  26         for exp := 0; exp < size; exp++ {
  27             values := make([]stats, size)
  28             for i := 0; i < exp; i++ {
  29                 v := resultSuccess
  30                 if i%2 == 1 {
  31                     v = resultError
  32                 }
  33                 values[i].result = v
  34             }
  35 
  36             if got := countLeadingReady(values); got != exp {
  37                 const fs = `size %d: expected %d, instead of %d`
  38                 t.Fatalf(fs, size, exp, got)
  39             }
  40         }
  41     }
  42 }
  43 
  44 func TestStats(t *testing.T) {
  45     var tests = []struct {
  46         Input    string
  47         Expected stats
  48     }{
  49         {
  50             ``,
  51             stats{},
  52         },
  53         {
  54             `abc`,
  55             stats{lines: 1, runes: 3, maxWidth: 3},
  56         },
  57         {
  58             "abc\tdef\r\n",
  59             stats{lines: 1, runes: 9, maxWidth: 7, tabs: 1, lf: 1, crlf: 1},
  60         },
  61         {
  62             "abc\tdef\r\n",
  63             stats{lines: 1, runes: 9, maxWidth: 7, tabs: 1, lf: 1, crlf: 1},
  64         },
  65         {
  66             "abc\tdef \r\n123\t456  789 ",
  67             stats{
  68                 lines: 2, runes: 23, maxWidth: 13,
  69                 spaces: 4, tabs: 2, trailing: 2, lf: 1, crlf: 1,
  70             },
  71         },
  72     }
  73 
  74     for _, tc := range tests {
  75         t.Run(tc.Input, func(t *testing.T) {
  76             var got stats
  77             err := got.updateStats(strings.NewReader(tc.Input))
  78             if err != nil {
  79                 t.Error(err)
  80                 return
  81             }
  82 
  83             tc.Expected.bytes = len(tc.Input)
  84             tc.Expected.result = resultSuccess
  85             if got != tc.Expected {
  86                 t.Fatalf("expected\n%#v,\ngot\n%#v", tc.Expected, got)
  87                 return
  88             }
  89         })
  90     }
  91 }