File: coby/go.mod
   1 module coby
   2 
   3 go 1.18

     File: coby/info.txt
   1 coby [files/folders...]
   2 
   3 COunt BYtes finds out some simple byte-related stats, counting
   4 
   5     - bytes
   6     - lines
   7     - how many lines have trailing spaces
   8     - how many lines end with a CRLF pair
   9     - all-off (0) bytes
  10     - all-on (255) bytes
  11     - high-bytes (128+)
  12 
  13 The output is TSV (tab-separated values) lines, where the first line has
  14 all the column names.
  15 
  16 When no filepaths are given, the standard input is used by default. All
  17 folder names given expand recursively into all filenames in them.

     File: coby/main.go
   1 package main
   2 
   3 import (
   4     "bufio"
   5     "errors"
   6     "io"
   7     "io/fs"
   8     "os"
   9     "path/filepath"
  10     "runtime"
  11     "strconv"
  12     "sync"
  13 
  14     _ "embed"
  15 )
  16 
  17 // Note: the code is avoiding using the fmt package to save hundreds of
  18 // kilobytes on the resulting executable, which is a noticeable difference.
  19 
// info is the help message func main shows when the first cmd-line argument
// is one of the help options: its text comes from file info.txt, which is
// embedded into the app when it's built
//
//go:embed info.txt
var info string
  22 
  23 // header is the first output line
  24 var header = []string{
  25     `name`,
  26     `bytes`,
  27     `runes`,
  28     `lines`,
  29     `lf`,
  30     `crlf`,
  31     `spaces`,
  32     `tabs`,
  33     `trails`,
  34     `nulls`,
  35     `fulls`,
  36     `highs`,
  37 }
  38 
// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully: values of this type
// are sent by the worker tasks started in func handleInputs, and are received
// by func handleOutput; the order of its fields matters, since func
// handleInputs makes these values positionally
type event struct {
    // Index points to the task's entry in the results-slice
    Index int

    // Err is the completed task's error, or nil when the task succeeded
    Err error
}
  48 
  49 func main() {
  50     if len(os.Args) > 1 {
  51         switch os.Args[1] {
  52         case `-h`, `--h`, `-help`, `--help`:
  53             os.Stderr.WriteString(info)
  54             return
  55         }
  56     }
  57 
  58     // show first/heading line right away, to let users know things are
  59     // happening
  60     for i, s := range header {
  61         if i > 0 {
  62             os.Stdout.WriteString("\t")
  63         }
  64         os.Stdout.WriteString(s)
  65     }
  66     // assume an error means later stages/apps in a pipe had enough input and
  67     // quit successfully, so quit successfully too
  68     _, err := os.Stdout.WriteString("\n")
  69     if err != nil {
  70         return
  71     }
  72 
  73     // names has all filepaths given, ignoring repetitions
  74     names, ok := findAllFiles(unique(os.Args[1:]))
  75     if !ok {
  76         os.Exit(1)
  77     }
  78     if len(names) == 0 {
  79         names = []string{`-`}
  80     }
  81 
  82     // results has all its items updated concurrently: this is safe to do,
  83     // as the tasks update values in separate indices of this slice, and
  84     // when an item is ready to show, its values aren't changing anymore
  85     results := make([]stats, len(names))
  86 
  87     events := make(chan event)
  88     go handleInputs(names, results, events)
  89     if !handleOutput(os.Stdout, results, events) {
  90         os.Exit(1)
  91     }
  92 }
  93 
  94 // handleInputs launches all the tasks which do the actual work, limiting how
  95 // many inputs are being worked on at the same time
  96 func handleInputs(names []string, results []stats, events chan event) {
  97     // allow output-reporter task to end, and thus the app
  98     defer close(events)
  99 
 100     // permissions limits how many worker tasks can be active at the same
 101     // time: when given many filepaths to work on, rate-limiting avoids
 102     // a massive number of concurrent tasks which read and process input
 103     permissions := make(chan struct{}, runtime.NumCPU())
 104     defer close(permissions)
 105 
 106     var inputs sync.WaitGroup
 107     for i := range names {
 108         // wait until some concurrency-room is available
 109         permissions <- struct{}{}
 110         inputs.Add(1)
 111 
 112         go func(i int) {
 113             defer inputs.Done()
 114             err := handleInput(&results[i], names[i])
 115             events <- event{i, err}
 116             <-permissions
 117         }(i)
 118     }
 119 
 120     // wait for all inputs, before closing the `events` channel
 121     inputs.Wait()
 122 }
 123 
 124 // handleInput handles each work-item for func handleInputs
 125 func handleInput(res *stats, path string) error {
 126     res.name = path
 127 
 128     if path == `-` {
 129         return res.updateStats(os.Stdin)
 130     }
 131 
 132     f, err := os.Open(path)
 133     if err != nil {
 134         res.result = resultError
 135         // on windows, file-not-found error messages may mention `CreateFile`,
 136         // even when trying to open files in read-only mode
 137         return errors.New(`can't open file named ` + path)
 138     }
 139     defer f.Close()
 140 
 141     return res.updateStats(f)
 142 }
 143 
 144 // handleOutput asynchronously updates output as results are known, whether
 145 // it's errors or successful results; returns whether it succeeded, which
 146 // means no errors happened
 147 func handleOutput(w io.Writer, results []stats, events chan event) (ok bool) {
 148     bw := bufio.NewWriter(w)
 149     defer bw.Flush()
 150 
 151     bw.Flush()
 152 
 153     // keep track of which tasks are over, so that on each event all leading
 154     // results which are ready are shown: all of this ensures prompt output
 155     // updates as soon as results come in, while keeping the original order
 156     // of the names/filepaths given
 157     resultsLeft := results
 158 
 159     for v := range events {
 160         if v.Err != nil {
 161             ok = false
 162             bw.Flush()
 163             showError(v.Err)
 164 
 165             // stay in the current loop, in case this failure was keeping
 166             // previous successes from showing up
 167         }
 168 
 169         n := countLeadingReady(resultsLeft)
 170 
 171         for _, res := range resultsLeft[:n] {
 172             if err := showResult(bw, res); err != nil {
 173                 // assume later stages/apps in a pipe had enough input and
 174                 // quit successfully, so quit successfully too
 175                 return true
 176             }
 177         }
 178         resultsLeft = resultsLeft[n:]
 179 
 180         // flush output-buffer only if anything new was shown
 181         if n > 0 {
 182             bw.Flush()
 183         }
 184     }
 185 
 186     return ok
 187 }
 188 
 189 // showError standardizes how errors from this app look
 190 func showError(err error) {
 191     os.Stderr.WriteString("\x1b[31m")
 192     os.Stderr.WriteString(err.Error())
 193     os.Stderr.WriteString("\x1b[0m\n")
 194 }
 195 
 196 // showResult does what it says
 197 func showResult(w *bufio.Writer, res stats) error {
 198     if res.result == resultError {
 199         return nil
 200     }
 201 
 202     var buf [64]byte
 203     w.WriteString(res.name)
 204     w.Write([]byte{'\t'})
 205     w.Write(strconv.AppendUint(buf[:0], uint64(res.bytes), 10))
 206     w.Write([]byte{'\t'})
 207     w.Write(strconv.AppendUint(buf[:0], uint64(res.runes), 10))
 208     w.Write([]byte{'\t'})
 209     w.Write(strconv.AppendUint(buf[:0], uint64(res.lines), 10))
 210     w.Write([]byte{'\t'})
 211     w.Write(strconv.AppendUint(buf[:0], uint64(res.lf), 10))
 212     w.Write([]byte{'\t'})
 213     w.Write(strconv.AppendUint(buf[:0], uint64(res.crlf), 10))
 214     w.Write([]byte{'\t'})
 215     w.Write(strconv.AppendUint(buf[:0], uint64(res.spaces), 10))
 216     w.Write([]byte{'\t'})
 217     w.Write(strconv.AppendUint(buf[:0], uint64(res.tabs), 10))
 218     w.Write([]byte{'\t'})
 219     w.Write(strconv.AppendUint(buf[:0], uint64(res.trailing), 10))
 220     w.Write([]byte{'\t'})
 221     w.Write(strconv.AppendUint(buf[:0], uint64(res.nulls), 10))
 222     w.Write([]byte{'\t'})
 223     w.Write(strconv.AppendUint(buf[:0], uint64(res.fulls), 10))
 224     w.Write([]byte{'\t'})
 225     w.Write(strconv.AppendUint(buf[:0], uint64(res.highs), 10))
 226     _, err := w.Write([]byte{'\n'})
 227     return err
 228 }
 229 
 230 // unique ensures items only appear once in the result, keeping the original
 231 // slice unchanged
 232 func unique(src []string) []string {
 233     var unique []string
 234     got := make(map[string]struct{})
 235     for _, s := range src {
 236         if _, ok := got[s]; ok {
 237             continue
 238         }
 239         unique = append(unique, s)
 240         got[s] = struct{}{}
 241     }
 242     return unique
 243 }
 244 
 245 // findAllFiles does what it says, given a mix of file/folder paths, finding
 246 // all files recursively in the case of folders
 247 func findAllFiles(paths []string) (found []string, ok bool) {
 248     var unique []string
 249     got := make(map[string]struct{})
 250     ok = true
 251 
 252     for _, root := range paths {
 253         // a dash means standard input
 254         if root == `-` {
 255             if _, ok := got[root]; ok {
 256                 continue
 257             }
 258 
 259             unique = append(unique, root)
 260             got[root] = struct{}{}
 261             continue
 262         }
 263 
 264         _, err := os.Stat(root)
 265         if os.IsNotExist(err) {
 266             ok = false
 267             // on windows, file-not-found error messages may mention `CreateFile`,
 268             // even when trying to open files in read-only mode
 269             err := errors.New(`can't find file/folder named ` + root)
 270             showError(err)
 271             continue
 272         }
 273 
 274         err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
 275             if err != nil {
 276                 return err
 277             }
 278 
 279             if d.IsDir() {
 280                 return nil
 281             }
 282 
 283             if _, ok := got[path]; ok {
 284                 return nil
 285             }
 286 
 287             unique = append(unique, path)
 288             got[path] = struct{}{}
 289             return nil
 290         })
 291 
 292         if err != nil {
 293             ok = false
 294             showError(err)
 295         }
 296     }
 297 
 298     return unique, ok
 299 }

     File: coby/mit-license.txt
   1 The MIT License (MIT)
   2 
   3 Copyright © 2024 pacman64
   4 
   5 Permission is hereby granted, free of charge, to any person obtaining a copy of
   6 this software and associated documentation files (the “Software”), to deal
   7 in the Software without restriction, including without limitation the rights to
   8 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   9 of the Software, and to permit persons to whom the Software is furnished to do
  10 so, subject to the following conditions:
  11 
  12 The above copyright notice and this permission notice shall be included in all
  13 copies or substantial portions of the Software.
  14 
  15 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21 SOFTWARE.

     File: coby/stats.go
   1 package main
   2 
   3 import (
   4     "io"
   5 )
   6 
   7 // isZero enables branchless-counting, when xor-compared bytes are used
   8 // as indices for it
   9 var isZero = [256]byte{1}
  10 
// counter makes it easy to change the int-size of almost all counters: the
// byte-count in type stats is the exception, as it's a plain int
type counter int
  13 
  14 // statResult constrains possible result-states/values in type stats
  15 type statResult int
  16 
  17 const (
  18     // resultPending is the default not-yet-ready result-status
  19     resultPending = statResult(0)
  20 
  21     // resultError signals result should show as an error, instead of data
  22     resultError = statResult(1)
  23 
  24     // resultSuccess means result can be shown
  25     resultSuccess = statResult(2)
  26 )
  27 
// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
    // bytes counts all bytes read
    bytes int

    // lines counts lines, and is 0 only when the byte-count is also 0
    lines counter

    // runes counts utf-8 sequences, each of which can use up to 4 bytes and
    // is usually a complete symbol: `emoji` country-flags are commonly-used
    // counter-examples, as these `symbols` need 2 runes of 4 bytes each,
    // which means they use 8 bytes in total
    runes counter

    // maxWidth is the maximum width of lines, excluding carriage-returns
    // and/or line-feeds: each ASCII byte adds 1 to the width of its line,
    // while func updateUsing decides how non-ASCII bytes add to it
    maxWidth counter

    // nulls counts all-bits-off bytes
    nulls counter

    // fulls counts all-bits-on bytes
    fulls counter

    // highs counts bytes with their `top` (highest-order) bit on
    highs counter

    // spaces counts ASCII spaces
    spaces counter

    // tabs counts ASCII tabs
    tabs counter

    // trailing counts lines with trailing spaces in them
    trailing counter

    // lf counts ASCII line-feeds as their own byte-values: this means its
    // value will always be at least the same as field `crlf`
    lf counter

    // crlf counts ASCII CRLF byte-pairs
    crlf counter

    // name is the filepath of the file/source these stats are about
    name string

    // result keeps track of whether results are valid and/or ready: its
    // zero value means results aren't ready yet
    result statResult
}
  77 
  78 // updateStats does what it says, reading everything from a reader
  79 func (res *stats) updateStats(r io.Reader) error {
  80     err := res.updateUsing(r)
  81     if err == io.EOF {
  82         err = nil
  83     }
  84 
  85     if err == nil {
  86         res.result = resultSuccess
  87     } else {
  88         res.result = resultError
  89     }
  90     return err
  91 }
  92 
  93 // updateUsing helps func updateStats do its job
  94 func (res *stats) updateUsing(r io.Reader) error {
  95     var width counter
  96     var highRun int
  97     var prev1, prev2 byte
  98     var buf [16 * 1024]byte
  99     var tallies [256]uint64
 100 
 101     for {
 102         n, err := r.Read(buf[:])
 103         if n < 1 {
 104             if err == io.EOF {
 105                 res.tabs = counter(tallies['\t'])
 106                 res.spaces = counter(tallies[' '])
 107                 res.lf = counter(tallies['\n'])
 108                 res.nulls = counter(tallies[0])
 109                 res.fulls = counter(tallies[255])
 110                 for i := 128; i < 256; i++ {
 111                     res.highs += counter(tallies[i])
 112                 }
 113                 return res.handleEnd(width, prev1, highRun)
 114             }
 115             return err
 116         }
 117 
 118         res.bytes += n
 119         chunk := buf[:n]
 120 
 121         for _, b := range chunk {
 122             // count values without branching, because it's fun
 123             tallies[b]++
 124 
 125             // handle non-ASCII runes, assuming input is valid UTF-8
 126             if b >= 128 {
 127                 if highRun < 3 {
 128                     highRun++
 129                 } else {
 130                     highRun = 0
 131                     res.runes++
 132                     width++
 133                 }
 134 
 135                 prev2 = prev1
 136                 prev1 = b
 137                 continue
 138             }
 139 
 140             // handle line-feeds
 141             if b == '\n' {
 142                 res.lines++
 143 
 144                 crlf := count(prev1, '\r')
 145                 res.crlf += crlf
 146 
 147                 // count lines with trailing spaces, whether these end with
 148                 // a CRLF byte-pair or just a line-feed byte
 149                 res.trailing += count(prev1, ' ')
 150                 res.trailing += crlf & count(prev2, ' ')
 151 
 152                 // exclude any CR from the current line's width-count
 153                 width -= crlf
 154                 if res.maxWidth < width {
 155                     res.maxWidth = width
 156                 }
 157 
 158                 prev2 = prev1
 159                 prev1 = b
 160 
 161                 res.runes++
 162                 highRun = 0
 163                 width = 0
 164                 continue
 165             }
 166 
 167             prev2 = prev1
 168             prev1 = b
 169 
 170             res.runes++
 171             highRun = 0
 172             width++
 173         }
 174     }
 175 }
 176 
 177 // handleEnd fixes/finalizes stats when input data end; this func is only
 178 // meant to be used by func updateStats, since it takes some of the latter's
 179 // local variables
 180 func (res *stats) handleEnd(width counter, prev1 byte, highRun int) error {
 181     if prev1 == ' ' {
 182         res.trailing++
 183     }
 184 
 185     if res.maxWidth < width {
 186         res.maxWidth = width
 187     }
 188 
 189     // avoid reporting 0 lines with a non-0 byte-count: this is unlike the
 190     // standard cmd-line tool `wc`
 191     if res.bytes > 0 && prev1 != '\n' {
 192         res.lines++
 193     }
 194 
 195     if highRun > 0 {
 196         res.runes++
 197     }
 198     return nil
 199 }
 200 
 201 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 202 // be added directly/branchlessly to totals
 203 func count(x, y byte) counter {
 204     return counter(isZero[x^y])
 205 }
 206 
 207 // countLeadingReady finds how many items are ready to show at the start of a
 208 // results-slice, which ensures output matches the original item-order
 209 func countLeadingReady(values []stats) int {
 210     for i, v := range values {
 211         if v.result == resultPending {
 212             return i
 213         }
 214     }
 215     return len(values)
 216 }

     File: coby/stats_test.go
   1 package main
   2 
   3 import (
   4     "strings"
   5     "testing"
   6 )
   7 
   8 func TestCount(t *testing.T) {
   9     for x := 0; x < 256; x++ {
  10         for y := 0; y < 256; y++ {
  11             var exp counter
  12             if x == y {
  13                 exp = 1
  14             }
  15 
  16             if got := count(byte(x), byte(y)); got != exp {
  17                 t.Fatalf(`%d, %d: expected %v, but got %v`, x, y, exp, got)
  18                 return
  19             }
  20         }
  21     }
  22 }
  23 
  24 func TestCountLeadingReady(t *testing.T) {
  25     for size := 0; size <= 20; size++ {
  26         for exp := 0; exp < size; exp++ {
  27             values := make([]stats, size)
  28             for i := 0; i < exp; i++ {
  29                 v := resultSuccess
  30                 if i%2 == 1 {
  31                     v = resultError
  32                 }
  33                 values[i].result = v
  34             }
  35 
  36             if got := countLeadingReady(values); got != exp {
  37                 const fs = `size %d: expected %d, instead of %d`
  38                 t.Fatalf(fs, size, exp, got)
  39             }
  40         }
  41     }
  42 }
  43 
  44 func TestStats(t *testing.T) {
  45     var tests = []struct {
  46         Input    string
  47         Expected stats
  48     }{
  49         {
  50             ``,
  51             stats{},
  52         },
  53         {
  54             `abc`,
  55             stats{lines: 1, runes: 3, maxWidth: 3},
  56         },
  57         {
  58             "abc\tdef\r\n",
  59             stats{lines: 1, runes: 9, maxWidth: 7, tabs: 1, lf: 1, crlf: 1},
  60         },
  61         {
  62             "abc\tdef\r\n",
  63             stats{lines: 1, runes: 9, maxWidth: 7, tabs: 1, lf: 1, crlf: 1},
  64         },
  65         {
  66             "abc\tdef \r\n123\t456  789 ",
  67             stats{
  68                 lines: 2, runes: 23, maxWidth: 13,
  69                 spaces: 4, tabs: 2, trailing: 2, lf: 1, crlf: 1,
  70             },
  71         },
  72     }
  73 
  74     for _, tc := range tests {
  75         t.Run(tc.Input, func(t *testing.T) {
  76             var got stats
  77             err := got.updateStats(strings.NewReader(tc.Input))
  78             if err != nil {
  79                 t.Error(err)
  80                 return
  81             }
  82 
  83             tc.Expected.bytes = len(tc.Input)
  84             tc.Expected.result = resultSuccess
  85             if got != tc.Expected {
  86                 t.Fatalf("expected\n%#v,\ngot\n%#v", tc.Expected, got)
  87                 return
  88             }
  89         })
  90     }
  91 }