File: coby/info.txt 1 coby [files/folders...] 2 3 COunt BYtes finds out some simple byte-related stats, counting 4 5 - bytes 6 - lines 7 - how many lines have trailing spaces 8 - how many lines end with a CRLF pair 9 - all-off (0) bytes 10 - all-on (255) bytes 11 - high-bytes (128+) 12 13 The output is TSV (tab-separated values) lines, where the first line has 14 all the column names. 15 16 When no filepaths are given, the standard input is used by default. All 17 folder names given expand recursively into all filenames in them. File: coby/main.go 1 package main 2 3 import ( 4 "bufio" 5 "errors" 6 "io" 7 "io/fs" 8 "os" 9 "path/filepath" 10 "runtime" 11 "strconv" 12 "sync" 13 14 _ "embed" 15 ) 16 17 // Note: the code is avoiding using the fmt package to save hundreds of 18 // kilobytes on the resulting executable, which is a noticeable difference. 19 20 //go:embed info.txt 21 var info string 22 23 // header is the first output line 24 var header = []string{ 25 `name`, 26 `bytes`, 27 `runes`, 28 `lines`, 29 `lf`, 30 `crlf`, 31 `spaces`, 32 `tabs`, 33 `trails`, 34 `nulls`, 35 `fulls`, 36 `highs`, 37 } 38 39 // event has what the output-reporting task needs to show the results of a 40 // task which has just completed, perhaps unsuccessfully 41 type event struct { 42 // Index points to the task's entry in the results-slice 43 Index int 44 45 // Stats has all the byte-related stats 46 Stats stats 47 48 // Err is the completed task's error, or lack of 49 Err error 50 } 51 52 func main() { 53 if len(os.Args) > 1 { 54 switch os.Args[1] { 55 case `-h`, `--h`, `-help`, `--help`: 56 os.Stderr.WriteString(info) 57 return 58 } 59 } 60 61 // show first/heading line right away, to let users know things are 62 // happening 63 for i, s := range header { 64 if i > 0 { 65 os.Stdout.WriteString("\t") 66 } 67 os.Stdout.WriteString(s) 68 } 69 // assume an error means later stages/apps in a pipe had enough input and 70 // quit successfully, so quit successfully too 71 _, err := 
os.Stdout.WriteString("\n") 72 if err != nil { 73 return 74 } 75 76 // names has all filepaths given, ignoring repetitions 77 names, ok := findAllFiles(unique(os.Args[1:])) 78 if !ok { 79 os.Exit(1) 80 } 81 if len(names) == 0 { 82 names = []string{`-`} 83 } 84 85 events := make(chan event) 86 go handleInputs(names, events) 87 if !handleOutput(os.Stdout, len(names), events) { 88 os.Exit(1) 89 } 90 } 91 92 // handleInputs launches all the tasks which do the actual work, limiting how 93 // many inputs are being worked on at the same time 94 func handleInputs(names []string, events chan event) { 95 // allow output-reporter task to end, and thus the app 96 defer close(events) 97 98 // permissions limits how many worker tasks can be active at the same 99 // time: when given many filepaths to work on, rate-limiting avoids 100 // a massive number of concurrent tasks which read and process input 101 permissions := make(chan struct{}, runtime.NumCPU()) 102 defer close(permissions) 103 104 var inputs sync.WaitGroup 105 for i := range names { 106 // wait until some concurrency-room is available 107 permissions <- struct{}{} 108 inputs.Add(1) 109 110 go func(i int) { 111 defer inputs.Done() 112 res, err := handleInput(names[i]) 113 events <- event{i, res, err} 114 <-permissions 115 }(i) 116 } 117 118 // wait for all inputs, before closing the `events` channel 119 inputs.Wait() 120 } 121 122 // handleInput handles each work-item for func handleInputs 123 func handleInput(path string) (stats, error) { 124 var res stats 125 res.name = path 126 127 if path == `-` { 128 err := res.updateStats(os.Stdin) 129 return res, err 130 } 131 132 f, err := os.Open(path) 133 if err != nil { 134 res.result = resultError 135 // on windows, file-not-found error messages may mention `CreateFile`, 136 // even when trying to open files in read-only mode 137 return res, errors.New(`can't open file named ` + path) 138 } 139 defer f.Close() 140 141 err = res.updateStats(f) 142 return res, err 143 } 144 
145 // handleOutput asynchronously updates output as results are known, whether 146 // it's errors or successful results; returns whether it succeeded, which 147 // means no errors happened 148 func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) { 149 ok = true 150 bw := bufio.NewWriter(w) 151 defer bw.Flush() 152 153 results := make([]stats, rescount) 154 155 // keep track of which tasks are over, so that on each event all leading 156 // results which are ready are shown: all of this ensures prompt output 157 // updates as soon as results come in, while keeping the original order 158 // of the names/filepaths given 159 resultsLeft := results 160 161 for v := range events { 162 results[v.Index] = v.Stats 163 if v.Err != nil { 164 ok = false 165 bw.Flush() 166 showError(v.Err) 167 168 // stay in the current loop, in case this failure was keeping 169 // previous successes from showing up 170 } 171 172 n := countLeadingReady(resultsLeft) 173 174 for _, res := range resultsLeft[:n] { 175 if err := showResult(bw, res); err != nil { 176 // assume later stages/apps in a pipe had enough input and 177 // quit successfully, so quit successfully too 178 return true 179 } 180 } 181 resultsLeft = resultsLeft[n:] 182 183 // flush output-buffer only if anything new was shown 184 if n > 0 { 185 bw.Flush() 186 } 187 } 188 189 return ok 190 } 191 192 // showError standardizes how errors from this app look 193 func showError(err error) { 194 os.Stderr.WriteString("\x1b[31m") 195 os.Stderr.WriteString(err.Error()) 196 os.Stderr.WriteString("\x1b[0m\n") 197 } 198 199 // showResult does what it says 200 func showResult(w *bufio.Writer, res stats) error { 201 if res.result == resultError { 202 return nil 203 } 204 205 var buf [64]byte 206 w.WriteString(res.name) 207 w.Write([]byte{'\t'}) 208 w.Write(strconv.AppendUint(buf[:0], uint64(res.bytes), 10)) 209 w.Write([]byte{'\t'}) 210 w.Write(strconv.AppendUint(buf[:0], uint64(res.runes), 10)) 211 w.Write([]byte{'\t'}) 
212 w.Write(strconv.AppendUint(buf[:0], uint64(res.lines), 10)) 213 w.Write([]byte{'\t'}) 214 w.Write(strconv.AppendUint(buf[:0], uint64(res.lf), 10)) 215 w.Write([]byte{'\t'}) 216 w.Write(strconv.AppendUint(buf[:0], uint64(res.crlf), 10)) 217 w.Write([]byte{'\t'}) 218 w.Write(strconv.AppendUint(buf[:0], uint64(res.spaces), 10)) 219 w.Write([]byte{'\t'}) 220 w.Write(strconv.AppendUint(buf[:0], uint64(res.tabs), 10)) 221 w.Write([]byte{'\t'}) 222 w.Write(strconv.AppendUint(buf[:0], uint64(res.trailing), 10)) 223 w.Write([]byte{'\t'}) 224 w.Write(strconv.AppendUint(buf[:0], uint64(res.nulls), 10)) 225 w.Write([]byte{'\t'}) 226 w.Write(strconv.AppendUint(buf[:0], uint64(res.fulls), 10)) 227 w.Write([]byte{'\t'}) 228 w.Write(strconv.AppendUint(buf[:0], uint64(res.highs), 10)) 229 _, err := w.Write([]byte{'\n'}) 230 return err 231 } 232 233 // unique ensures items only appear once in the result, keeping the original 234 // slice unchanged 235 func unique(src []string) []string { 236 var unique []string 237 got := make(map[string]struct{}) 238 for _, s := range src { 239 if _, ok := got[s]; ok { 240 continue 241 } 242 unique = append(unique, s) 243 got[s] = struct{}{} 244 } 245 return unique 246 } 247 248 // findAllFiles does what it says, given a mix of file/folder paths, finding 249 // all files recursively in the case of folders 250 func findAllFiles(paths []string) (found []string, ok bool) { 251 var unique []string 252 got := make(map[string]struct{}) 253 ok = true 254 255 for _, root := range paths { 256 // a dash means standard input 257 if root == `-` { 258 if _, ok := got[root]; ok { 259 continue 260 } 261 262 unique = append(unique, root) 263 got[root] = struct{}{} 264 continue 265 } 266 267 _, err := os.Stat(root) 268 if os.IsNotExist(err) { 269 ok = false 270 // on windows, file-not-found error messages may mention `CreateFile`, 271 // even when trying to open files in read-only mode 272 err := errors.New(`can't find file/folder named ` + root) 273 
showError(err) 274 continue 275 } 276 277 err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error { 278 if err != nil { 279 return err 280 } 281 282 if d.IsDir() { 283 return nil 284 } 285 286 if _, ok := got[path]; ok { 287 return nil 288 } 289 290 unique = append(unique, path) 291 got[path] = struct{}{} 292 return nil 293 }) 294 295 if err != nil { 296 ok = false 297 showError(err) 298 } 299 } 300 301 return unique, ok 302 } File: coby/mit-license.txt 1 The MIT License (MIT) 2 3 Copyright © 2024 pacman64 4 5 Permission is hereby granted, free of charge, to any person obtaining a copy of 6 this software and associated documentation files (the “Software”), to deal 7 in the Software without restriction, including without limitation the rights to 8 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 9 of the Software, and to permit persons to whom the Software is furnished to do 10 so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in all 13 copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 SOFTWARE. 
// File: coby/stats.go (package main; imports io)

// isZero enables branchless-counting, when xor-compared bytes are used
// as indices for it: only its first item is 1, all others being 0; func
// count no longer uses it, but it's kept in case the table-lookup version
// of that func comes back
var isZero = [256]byte{1}

// counter makes it easy to change the int-size of almost all counters
type counter int

// statResult constrains possible result-states/values in type stats
type statResult int

const (
	// resultPending is the default not-yet-ready result-status
	resultPending = statResult(0)

	// resultError signals result should show as an error, instead of data
	resultError = statResult(1)

	// resultSuccess means result can be shown
	resultSuccess = statResult(2)
)

// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read
	bytes int

	// lines counts lines, and is 0 only when the byte-count is also 0
	lines counter

	// runes counts utf-8 sequences, each of which can use up to 4 bytes and
	// is usually a complete symbol: `emoji` country-flags are commonly-used
	// counter-examples, as each of these `symbols` needs 2 runes, which use
	// 8 bytes in total
	runes counter

	// maxWidth is maximum byte-width of lines, excluding carriage-returns
	// and/or line-feeds
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// name is the filepath of the file/source these stats are about
	name string

	// result keeps track of whether results are valid and/or ready
	result statResult
}

// updateStats reads everything from the reader given, updating all stats
// along the way; when done, the result-status is either resultSuccess or
// resultError, the latter always coming with a non-nil error
func (res *stats) updateStats(r io.Reader) error {
	err := res.updateUsing(r)
	if err == io.EOF {
		err = nil
	}

	if err == nil {
		res.result = resultSuccess
	} else {
		res.result = resultError
	}
	return err
}

// updateUsing helps func updateStats do its job: it follows the io.Reader
// contract, which means
//
//   - bytes read are always counted before looking at the error returned
//     along with them
//   - reads giving no bytes and no error mean nothing happened, instead of
//     meaning the input is over
//   - any error other than io.EOF is returned right away, leaving the
//     tally-based fields unset
func (res *stats) updateUsing(r io.Reader) error {
	var width counter
	var prev1, prev2 byte
	var buf [16 * 1024]byte
	var tallies [256]uint64

	for {
		n, err := r.Read(buf[:])

		res.bytes += n
		for _, b := range buf[:n] {
			// tally all byte-values, to avoid branching on each counter
			tallies[b]++

			// count all bytes except utf-8 continuation bytes, which is the
			// same as counting runes, assuming the input is valid UTF-8
			res.runes += 1 - count(b&0xc0, 0x80)

			if b == '\n' {
				crlf := count(prev1, '\r')
				res.crlf += crlf

				// count lines with trailing spaces, whether these end with
				// a CRLF byte-pair or just a line-feed byte
				if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
					res.trailing++
				}

				// exclude any CR from the current line's width-count
				width -= crlf
				if res.maxWidth < width {
					res.maxWidth = width
				}
				width = 0
			} else {
				width++
			}

			prev2 = prev1
			prev1 = b
		}

		if err == nil {
			// this includes reads which gave no bytes: just try again
			continue
		}
		if err != io.EOF {
			return err
		}

		// input is over: turn tallies into the stats which depend on them
		res.lines = counter(tallies['\n'])
		res.tabs = counter(tallies['\t'])
		res.spaces = counter(tallies[' '])
		res.lf = counter(tallies['\n'])
		res.nulls = counter(tallies[0])
		res.fulls = counter(tallies[255])
		for i := 128; i < 256; i++ {
			res.highs += counter(tallies[i])
		}
		return res.handleEnd(width, prev1, prev2)
	}
}

// handleEnd fixes/finalizes stats when input data end; this func is only
// meant to be used by func updateUsing, since it takes some of the latter's
// local variables; it always returns nil
func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
	// a last line which doesn't end with a line-feed can still end with
	// trailing spaces
	if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
		res.trailing++
	}

	if res.maxWidth < width {
		res.maxWidth = width
	}

	// avoid reporting 0 lines with a non-0 byte-count: this is unlike the
	// standard cmd-line tool `wc`
	if res.bytes > 0 && prev1 != '\n' {
		res.lines++
	}

	return nil
}

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly to totals
func count(x, y byte) counter {
	if x != y {
		return 0
	}
	return 1
}

// countLeadingReady finds how many items are ready to show at the start of a
// results-slice, which ensures output matches the original item-order
func countLeadingReady(values []stats) int {
	// loop by index, to avoid copying whole structs just to check a field
	for i := range values {
		if values[i].result == resultPending {
			return i
		}
	}
	return len(values)
}

// File: coby/stats_test.go (package main; imports strings, testing)

// TestCount checks func count against all possible pairs of bytes
func TestCount(t *testing.T) {
	for x := 0; x < 256; x++ {
		for y := 0; y < 256; y++ {
			var exp counter
			if x == y {
				exp = 1
			}

			if got := count(byte(x), byte(y)); got != exp {
				t.Fatalf(`%d, %d: expected %v, but got %v`, x, y, exp, got)
			}
		}
	}
}

// TestCountLeadingReady checks slices of various sizes, each starting with
// various numbers of ready items, including none and all of them
func TestCountLeadingReady(t *testing.T) {
	for size := 0; size <= 20; size++ {
		for exp := 0; exp <= size; exp++ {
			values := make([]stats, size)
			for i := 0; i < exp; i++ {
				// both non-pending statuses count as ready
				v := resultSuccess
				if i%2 == 1 {
					v = resultError
				}
				values[i].result = v
			}

			if got := countLeadingReady(values); got != exp {
				const fs = `size %d: expected %d, instead of %d`
				t.Fatalf(fs, size, exp, got)
			}
		}
	}
}

// TestStats checks the stats gathered from a few short inputs
func TestStats(t *testing.T) {
	var tests = []struct {
		Input    string
		Expected stats
	}{
		{
			``,
			stats{},
		},
		{
			`abc`,
			stats{lines: 1, runes: 3, maxWidth: 3},
		},
		{
			"abc\tdef\r\n",
			stats{lines: 1, runes: 9, maxWidth: 7, tabs: 1, lf: 1, crlf: 1},
		},
		{
			// multi-byte runes count once, while each of their bytes counts
			// both as a high byte and toward the line's byte-width
			"a\u00f1b\n",
			stats{lines: 1, runes: 4, maxWidth: 4, highs: 2, lf: 1},
		},
		{
			// NOTE(review): the listing this file was recovered from shows
			// a single space between `456` and `789`, which can't match the
			// expected values below (23 bytes, 4 spaces, max width 13), so
			// a run of spaces was presumably collapsed there - confirm
			// against the original file
			"abc\tdef \r\n123\t456  789 ",
			stats{
				lines: 2, runes: 23, maxWidth: 13,
				spaces: 4, tabs: 2, trailing: 2, lf: 1, crlf: 1,
			},
		},
	}

	for _, tc := range tests {
		t.Run(tc.Input, func(t *testing.T) {
			var got stats
			err := got.updateStats(strings.NewReader(tc.Input))
			if err != nil {
				t.Error(err)
				return
			}

			tc.Expected.bytes = len(tc.Input)
			tc.Expected.result = resultSuccess
			if got != tc.Expected {
				t.Fatalf("expected\n%#v,\ngot\n%#v", tc.Expected, got)
			}
		})
	}
}