File: coby/info.txt
   1 coby [files/folders...]
   2 
   3 COunt BYtes finds out some simple byte-related stats, counting
   4 
   5     - bytes
   6     - lines
   7     - how many lines have trailing spaces
   8     - how many lines end with a CRLF pair
   9     - all-off (0) bytes
  10     - all-on (255) bytes
  11     - high-bytes (128+)
  12 
  13 The output is TSV (tab-separated values) lines, where the first line has
  14 all the column names.
  15 
  16 When no filepaths are given, the standard input is used by default. All
  17 folder names given expand recursively into all filenames in them.

     File: coby/main.go
   1 package main
   2 
   3 import (
   4     "bufio"
   5     "errors"
   6     "io"
   7     "io/fs"
   8     "os"
   9     "path/filepath"
  10     "runtime"
  11     "strconv"
  12     "sync"
  13 
  14     _ "embed"
  15 )
  16 
  17 // Note: the code is avoiding using the fmt package to save hundreds of
  18 // kilobytes on the resulting executable, which is a noticeable difference.
  19 
  20 //go:embed info.txt
  21 var info string
  22 
  23 // header is the first output line
  24 var header = []string{
  25     `name`,
  26     `bytes`,
  27     `runes`,
  28     `lines`,
  29     `lf`,
  30     `crlf`,
  31     `spaces`,
  32     `tabs`,
  33     `trails`,
  34     `nulls`,
  35     `fulls`,
  36     `highs`,
  37 }
  38 
  39 // event has what the output-reporting task needs to show the results of a
  40 // task which has just completed, perhaps unsuccessfully
  41 type event struct {
  42     // Index points to the task's entry in the results-slice
  43     Index int
  44 
  45     // Stats has all the byte-related stats
  46     Stats stats
  47 
  48     // Err is the completed task's error, or lack of
  49     Err error
  50 }
  51 
  52 func main() {
  53     if len(os.Args) > 1 {
  54         switch os.Args[1] {
  55         case `-h`, `--h`, `-help`, `--help`:
  56             os.Stderr.WriteString(info)
  57             return
  58         }
  59     }
  60 
  61     // show first/heading line right away, to let users know things are
  62     // happening
  63     for i, s := range header {
  64         if i > 0 {
  65             os.Stdout.WriteString("\t")
  66         }
  67         os.Stdout.WriteString(s)
  68     }
  69     // assume an error means later stages/apps in a pipe had enough input and
  70     // quit successfully, so quit successfully too
  71     _, err := os.Stdout.WriteString("\n")
  72     if err != nil {
  73         return
  74     }
  75 
  76     // names has all filepaths given, ignoring repetitions
  77     names, ok := findAllFiles(unique(os.Args[1:]))
  78     if !ok {
  79         os.Exit(1)
  80     }
  81     if len(names) == 0 {
  82         names = []string{`-`}
  83     }
  84 
  85     events := make(chan event)
  86     go handleInputs(names, events)
  87     if !handleOutput(os.Stdout, len(names), events) {
  88         os.Exit(1)
  89     }
  90 }
  91 
  92 // handleInputs launches all the tasks which do the actual work, limiting how
  93 // many inputs are being worked on at the same time
  94 func handleInputs(names []string, events chan event) {
  95     // allow output-reporter task to end, and thus the app
  96     defer close(events)
  97 
  98     // permissions limits how many worker tasks can be active at the same
  99     // time: when given many filepaths to work on, rate-limiting avoids
 100     // a massive number of concurrent tasks which read and process input
 101     permissions := make(chan struct{}, runtime.NumCPU())
 102     defer close(permissions)
 103 
 104     var inputs sync.WaitGroup
 105     for i := range names {
 106         // wait until some concurrency-room is available
 107         permissions <- struct{}{}
 108         inputs.Add(1)
 109 
 110         go func(i int) {
 111             defer inputs.Done()
 112             res, err := handleInput(names[i])
 113             events <- event{i, res, err}
 114             <-permissions
 115         }(i)
 116     }
 117 
 118     // wait for all inputs, before closing the `events` channel
 119     inputs.Wait()
 120 }
 121 
 122 // handleInput handles each work-item for func handleInputs
 123 func handleInput(path string) (stats, error) {
 124     var res stats
 125     res.name = path
 126 
 127     if path == `-` {
 128         err := res.updateStats(os.Stdin)
 129         return res, err
 130     }
 131 
 132     f, err := os.Open(path)
 133     if err != nil {
 134         res.result = resultError
 135         // on windows, file-not-found error messages may mention `CreateFile`,
 136         // even when trying to open files in read-only mode
 137         return res, errors.New(`can't open file named ` + path)
 138     }
 139     defer f.Close()
 140 
 141     err = res.updateStats(f)
 142     return res, err
 143 }
 144 
 145 // handleOutput asynchronously updates output as results are known, whether
 146 // it's errors or successful results; returns whether it succeeded, which
 147 // means no errors happened
 148 func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
 149     ok = true
 150     bw := bufio.NewWriter(w)
 151     defer bw.Flush()
 152 
 153     results := make([]stats, rescount)
 154 
 155     // keep track of which tasks are over, so that on each event all leading
 156     // results which are ready are shown: all of this ensures prompt output
 157     // updates as soon as results come in, while keeping the original order
 158     // of the names/filepaths given
 159     resultsLeft := results
 160 
 161     for v := range events {
 162         results[v.Index] = v.Stats
 163         if v.Err != nil {
 164             ok = false
 165             bw.Flush()
 166             showError(v.Err)
 167 
 168             // stay in the current loop, in case this failure was keeping
 169             // previous successes from showing up
 170         }
 171 
 172         n := countLeadingReady(resultsLeft)
 173 
 174         for _, res := range resultsLeft[:n] {
 175             if err := showResult(bw, res); err != nil {
 176                 // assume later stages/apps in a pipe had enough input and
 177                 // quit successfully, so quit successfully too
 178                 return true
 179             }
 180         }
 181         resultsLeft = resultsLeft[n:]
 182 
 183         // flush output-buffer only if anything new was shown
 184         if n > 0 {
 185             bw.Flush()
 186         }
 187     }
 188 
 189     return ok
 190 }
 191 
 192 // showError standardizes how errors from this app look
 193 func showError(err error) {
 194     os.Stderr.WriteString("\x1b[31m")
 195     os.Stderr.WriteString(err.Error())
 196     os.Stderr.WriteString("\x1b[0m\n")
 197 }
 198 
 199 // showResult does what it says
 200 func showResult(w *bufio.Writer, res stats) error {
 201     if res.result == resultError {
 202         return nil
 203     }
 204 
 205     var buf [64]byte
 206     w.WriteString(res.name)
 207     w.Write([]byte{'\t'})
 208     w.Write(strconv.AppendUint(buf[:0], uint64(res.bytes), 10))
 209     w.Write([]byte{'\t'})
 210     w.Write(strconv.AppendUint(buf[:0], uint64(res.runes), 10))
 211     w.Write([]byte{'\t'})
 212     w.Write(strconv.AppendUint(buf[:0], uint64(res.lines), 10))
 213     w.Write([]byte{'\t'})
 214     w.Write(strconv.AppendUint(buf[:0], uint64(res.lf), 10))
 215     w.Write([]byte{'\t'})
 216     w.Write(strconv.AppendUint(buf[:0], uint64(res.crlf), 10))
 217     w.Write([]byte{'\t'})
 218     w.Write(strconv.AppendUint(buf[:0], uint64(res.spaces), 10))
 219     w.Write([]byte{'\t'})
 220     w.Write(strconv.AppendUint(buf[:0], uint64(res.tabs), 10))
 221     w.Write([]byte{'\t'})
 222     w.Write(strconv.AppendUint(buf[:0], uint64(res.trailing), 10))
 223     w.Write([]byte{'\t'})
 224     w.Write(strconv.AppendUint(buf[:0], uint64(res.nulls), 10))
 225     w.Write([]byte{'\t'})
 226     w.Write(strconv.AppendUint(buf[:0], uint64(res.fulls), 10))
 227     w.Write([]byte{'\t'})
 228     w.Write(strconv.AppendUint(buf[:0], uint64(res.highs), 10))
 229     _, err := w.Write([]byte{'\n'})
 230     return err
 231 }
 232 
 233 // unique ensures items only appear once in the result, keeping the original
 234 // slice unchanged
 235 func unique(src []string) []string {
 236     var unique []string
 237     got := make(map[string]struct{})
 238     for _, s := range src {
 239         if _, ok := got[s]; ok {
 240             continue
 241         }
 242         unique = append(unique, s)
 243         got[s] = struct{}{}
 244     }
 245     return unique
 246 }
 247 
 248 // findAllFiles does what it says, given a mix of file/folder paths, finding
 249 // all files recursively in the case of folders
 250 func findAllFiles(paths []string) (found []string, ok bool) {
 251     var unique []string
 252     got := make(map[string]struct{})
 253     ok = true
 254 
 255     for _, root := range paths {
 256         // a dash means standard input
 257         if root == `-` {
 258             if _, ok := got[root]; ok {
 259                 continue
 260             }
 261 
 262             unique = append(unique, root)
 263             got[root] = struct{}{}
 264             continue
 265         }
 266 
 267         _, err := os.Stat(root)
 268         if os.IsNotExist(err) {
 269             ok = false
 270             // on windows, file-not-found error messages may mention `CreateFile`,
 271             // even when trying to open files in read-only mode
 272             err := errors.New(`can't find file/folder named ` + root)
 273             showError(err)
 274             continue
 275         }
 276 
 277         err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
 278             if err != nil {
 279                 return err
 280             }
 281 
 282             if d.IsDir() {
 283                 return nil
 284             }
 285 
 286             if _, ok := got[path]; ok {
 287                 return nil
 288             }
 289 
 290             unique = append(unique, path)
 291             got[path] = struct{}{}
 292             return nil
 293         })
 294 
 295         if err != nil {
 296             ok = false
 297             showError(err)
 298         }
 299     }
 300 
 301     return unique, ok
 302 }

     File: coby/mit-license.txt
   1 The MIT License (MIT)
   2 
   3 Copyright © 2024 pacman64
   4 
   5 Permission is hereby granted, free of charge, to any person obtaining a copy of
   6 this software and associated documentation files (the “Software”), to deal
   7 in the Software without restriction, including without limitation the rights to
   8 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   9 of the Software, and to permit persons to whom the Software is furnished to do
  10 so, subject to the following conditions:
  11 
  12 The above copyright notice and this permission notice shall be included in all
  13 copies or substantial portions of the Software.
  14 
  15 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21 SOFTWARE.

     File: coby/stats.go
   1 package main
   2 
   3 import (
   4     "io"
   5 )
   6 
   7 // isZero enables branchless-counting, when xor-compared bytes are used
   8 // as indices for it
   9 var isZero = [256]byte{1}
  10 
  11 // counter makes it easy to change the int-size of almost all counters
  12 type counter int
  13 
  14 // statResult constrains possible result-states/values in type stats
  15 type statResult int
  16 
  17 const (
  18     // resultPending is the default not-yet-ready result-status
  19     resultPending = statResult(0)
  20 
  21     // resultError signals result should show as an error, instead of data
  22     resultError = statResult(1)
  23 
  24     // resultSuccess means result can be shown
  25     resultSuccess = statResult(2)
  26 )
  27 
  28 // stats has all the size-stats for some input, as well as a way to
  29 // skip showing results, in case of an error such as `file not found`
  30 type stats struct {
  31     // bytes counts all bytes read
  32     bytes int
  33 
  34     // lines counts lines, and is 0 only when the byte-count is also 0
  35     lines counter
  36 
  37     // runes counts utf-8 sequences, each of which can use up to 4 bytes and
  38     // is usually a complete symbol: `emoji` country-flags are commonly-used
  39     // counter-examples, as these `symbols` need 2 runes, using 8 bytes each
  40     runes counter
  41 
  42     // maxWidth is maximum byte-width of lines, excluding carriage-returns
  43     // and/or line-feeds
  44     maxWidth counter
  45 
  46     // nulls counts all-bits-off bytes
  47     nulls counter
  48 
  49     // fulls counts all-bits-on bytes
  50     fulls counter
  51 
  52     // highs counts bytes with their `top` (highest-order) bit on
  53     highs counter
  54 
  55     // spaces counts ASCII spaces
  56     spaces counter
  57 
  58     // tabs counts ASCII tabs
  59     tabs counter
  60 
  61     // trailing counts lines with trailing spaces in them
  62     trailing counter
  63 
  64     // lf counts ASCII line-feeds as their own byte-values: this means its
  65     // value will always be at least the same as field `crlf`
  66     lf counter
  67 
  68     // crlf counts ASCII CRLF byte-pairs
  69     crlf counter
  70 
  71     // name is the filepath of the file/source these stats are about
  72     name string
  73 
  74     // results keeps track of whether results are valid and/or ready
  75     result statResult
  76 }
  77 
  78 // updateStats does what it says, reading everything from a reader
  79 func (res *stats) updateStats(r io.Reader) error {
  80     err := res.updateUsing(r)
  81     if err == io.EOF {
  82         err = nil
  83     }
  84 
  85     if err == nil {
  86         res.result = resultSuccess
  87     } else {
  88         res.result = resultError
  89     }
  90     return err
  91 }
  92 
  93 // updateUsing helps func updateStats do its job
  94 func (res *stats) updateUsing(r io.Reader) error {
  95     var width counter
  96     var prev1, prev2 byte
  97     var buf [16 * 1024]byte
  98     var tallies [256]uint64
  99 
 100     for {
 101         n, err := r.Read(buf[:])
 102         if n < 1 {
 103             if err == io.EOF {
 104                 res.lines = counter(tallies['\n'])
 105                 res.tabs = counter(tallies['\t'])
 106                 res.spaces = counter(tallies[' '])
 107                 res.lf = counter(tallies['\n'])
 108                 res.nulls = counter(tallies[0])
 109                 res.fulls = counter(tallies[255])
 110                 for i := 128; i < 256; i++ {
 111                     res.highs += counter(tallies[i])
 112                 }
 113                 return res.handleEnd(width, prev1, prev2)
 114             }
 115             return err
 116         }
 117 
 118         res.bytes += n
 119         chunk := buf[:n]
 120 
 121         for _, b := range chunk {
 122             // count values without branching, because it's fun
 123             tallies[b]++
 124 
 125             // handle non-ASCII runes, assuming input is valid UTF-8
 126             res.runes += 1 - count(b & 0xc0, 0x80)
 127 
 128             // handle line-feeds
 129             if b == '\n' {
 130                 crlf := count(prev1, '\r')
 131                 res.crlf += crlf
 132 
 133                 // count lines with trailing spaces, whether these end with
 134                 // a CRLF byte-pair or just a line-feed byte
 135                 if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 136                     res.trailing++
 137                 }
 138 
 139                 // exclude any CR from the current line's width-count
 140                 width -= crlf
 141                 if res.maxWidth < width {
 142                     res.maxWidth = width
 143                 }
 144 
 145                 prev2 = prev1
 146                 prev1 = b
 147                 width = 0
 148                 continue
 149             }
 150 
 151             prev2 = prev1
 152             prev1 = b
 153             width++
 154         }
 155     }
 156 }
 157 
 158 // handleEnd fixes/finalizes stats when input data end; this func is only
 159 // meant to be used by func updateStats, since it takes some of the latter's
 160 // local variables
 161 func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
 162     if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
 163         res.trailing++
 164     }
 165 
 166     if res.maxWidth < width {
 167         res.maxWidth = width
 168     }
 169 
 170     // avoid reporting 0 lines with a non-0 byte-count: this is unlike the
 171     // standard cmd-line tool `wc`
 172     if res.bytes > 0 && prev1 != '\n' {
 173         res.lines++
 174     }
 175 
 176     return nil
 177 }
 178 
 179 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 180 // be added directly/branchlessly to totals
 181 // func count(x, y byte) counter {
 182 //  return counter(isZero[x^y])
 183 // }
 184 
 185 // count checks if 2 bytes are the same, returning either 0 or 1, which can
 186 // be added directly/branchlessly to totals
 187 func count(x, y byte) counter {
 188     if (x != y) {
 189         return 0
 190     }
 191     return 1
 192 }
 193 
 194 // countLeadingReady finds how many items are ready to show at the start of a
 195 // results-slice, which ensures output matches the original item-order
 196 func countLeadingReady(values []stats) int {
 197     for i, v := range values {
 198         if v.result == resultPending {
 199             return i
 200         }
 201     }
 202     return len(values)
 203 }

     File: coby/stats_test.go
   1 package main
   2 
   3 import (
   4     "strings"
   5     "testing"
   6 )
   7 
   8 func TestCount(t *testing.T) {
   9     for x := 0; x < 256; x++ {
  10         for y := 0; y < 256; y++ {
  11             var exp counter
  12             if x == y {
  13                 exp = 1
  14             }
  15 
  16             if got := count(byte(x), byte(y)); got != exp {
  17                 t.Fatalf(`%d, %d: expected %v, but got %v`, x, y, exp, got)
  18                 return
  19             }
  20         }
  21     }
  22 }
  23 
  24 func TestCountLeadingReady(t *testing.T) {
  25     for size := 0; size <= 20; size++ {
  26         for exp := 0; exp < size; exp++ {
  27             values := make([]stats, size)
  28             for i := 0; i < exp; i++ {
  29                 v := resultSuccess
  30                 if i%2 == 1 {
  31                     v = resultError
  32                 }
  33                 values[i].result = v
  34             }
  35 
  36             if got := countLeadingReady(values); got != exp {
  37                 const fs = `size %d: expected %d, instead of %d`
  38                 t.Fatalf(fs, size, exp, got)
  39             }
  40         }
  41     }
  42 }
  43 
  44 func TestStats(t *testing.T) {
  45     var tests = []struct {
  46         Input    string
  47         Expected stats
  48     }{
  49         {
  50             ``,
  51             stats{},
  52         },
  53         {
  54             `abc`,
  55             stats{lines: 1, runes: 3, maxWidth: 3},
  56         },
  57         {
  58             "abc\tdef\r\n",
  59             stats{lines: 1, runes: 9, maxWidth: 7, tabs: 1, lf: 1, crlf: 1},
  60         },
  61         {
  62             "abc\tdef\r\n",
  63             stats{lines: 1, runes: 9, maxWidth: 7, tabs: 1, lf: 1, crlf: 1},
  64         },
  65         {
  66             "abc\tdef \r\n123\t456  789 ",
  67             stats{
  68                 lines: 2, runes: 23, maxWidth: 13,
  69                 spaces: 4, tabs: 2, trailing: 2, lf: 1, crlf: 1,
  70             },
  71         },
  72     }
  73 
  74     for _, tc := range tests {
  75         t.Run(tc.Input, func(t *testing.T) {
  76             var got stats
  77             err := got.updateStats(strings.NewReader(tc.Input))
  78             if err != nil {
  79                 t.Error(err)
  80                 return
  81             }
  82 
  83             tc.Expected.bytes = len(tc.Input)
  84             tc.Expected.result = resultSuccess
  85             if got != tc.Expected {
  86                 t.Fatalf("expected\n%#v,\ngot\n%#v", tc.Expected, got)
  87                 return
  88             }
  89         })
  90     }
  91 }