/*
The MIT License (MIT)

Copyright © 2024 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
*/

/*
Single-file source-code for coby: this version has no http(s) support. Even
the unit-tests from the original coby are omitted.

To compile a smaller-sized command-line app, you can use the `go` command as
follows:

	go build -ldflags "-s -w" -trimpath coby.go
*/

package main

import (
	"bufio"
	"errors"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"sync"
)

const info = `
coby [files/folders...]

COunt BYtes finds out some simple byte-related stats, counting

- bytes
- lines
- how many lines have trailing spaces
- how many lines end with a CRLF pair
- all-off (0) bytes
- all-on (255) bytes
- high-bytes (128+)
- which (if any) byte-order mark the data start with

The output is TSV (tab-separated values) lines, where the first line has all
the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them.
`

// header is the first output line
var header = []string{
	`name`,
	`bytes`,
	`runes`,
	`lines`,
	`lf`,
	`crlf`,
	`spaces`,
	`tabs`,
	`trails`,
	`nulls`,
	`fulls`,
	`highs`,
	`bom`,
}

// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully
type event struct {
	// Index points to the task's entry in the results-slice
	Index int

	// Stats has all the byte-related stats
	Stats stats

	// Err is the completed task's error, or lack of
	Err error
}

func main() {
	if len(os.Args) > 1 {
		switch os.Args[1] {
		case `-h`, `--h`, `-help`, `--help`:
			// info starts with a newline, which the [1:] skips
			os.Stderr.WriteString(info[1:])
			return
		}
	}

	// show first/heading line right away, to let users know things are
	// happening
	for i, s := range header {
		if i > 0 {
			os.Stdout.WriteString("\t")
		}
		os.Stdout.WriteString(s)
	}

	// assume an error means later stages/apps in a pipe had enough input and
	// quit successfully, so quit successfully too
	_, err := os.Stdout.WriteString("\n")
	if err != nil {
		return
	}

	// names has all filepaths given, ignoring repetitions
	names, ok := findAllFiles(unique(os.Args[1:]))
	if !ok {
		os.Exit(1)
	}
	if len(names) == 0 {
		// no filepaths given: default to standard input
		names = []string{`-`}
	}

	events := make(chan event)
	go handleInputs(names, events)
	if !handleOutput(os.Stdout, len(names), events) {
		os.Exit(1)
	}
}

// handleInputs launches all the tasks which do the actual work, limiting how
// many inputs are being worked on at the same time
func handleInputs(names []string, events chan event) {
	// allow output-reporter task to end, and thus the app
	defer close(events)

	// permissions limits how many worker tasks can be active at the same
	// time: when given many filepaths to work on, rate-limiting avoids
	// a massive number of concurrent tasks which read and process input
	permissions := make(chan struct{}, runtime.NumCPU())
	defer close(permissions)

	var inputs sync.WaitGroup
	for i := range names {
		// wait until some concurrency-room is available
		permissions <- struct{}{}

		inputs.Add(1)
		go func(i int) {
			defer inputs.Done()
			res, err := handleInput(names[i])
			events <- event{i, res, err}
			// give back the concurrency-room only after reporting, so
			// the rate-limit counts fully-finished tasks
			<-permissions
		}(i)
	}

	// wait for all inputs, before closing the `events` channel
	inputs.Wait()
}

// handleInput handles each work-item for func handleInputs: a dash stands
// for the standard input, anything else is opened as a file
func handleInput(path string) (stats, error) {
	var res stats
	res.name = path

	if path == `-` {
		err := res.updateStats(os.Stdin)
		return res, err
	}

	f, err := os.Open(path)
	if err != nil {
		res.result = resultError
		// on windows, file-not-found error messages may mention
		// `CreateFile`, even when trying to open files in read-only mode
		return res, errors.New(`can't open file named ` + path)
	}
	defer f.Close()

	err = res.updateStats(f)
	return res, err
}

// handleOutput asynchronously updates output as results are known, whether
// it's errors or successful results; returns whether it succeeded, which
// means no errors happened
func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
	ok = true
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	results := make([]stats, rescount)

	// keep track of which tasks are over, so that on each event all leading
	// results which are ready are shown: all of this ensures prompt output
	// updates as soon as results come in, while keeping the original order
	// of the names/filepaths given
	resultsLeft := results

	for v := range events {
		results[v.Index] = v.Stats

		if v.Err != nil {
			ok = false
			bw.Flush()
			showError(v.Err)
			// stay in the current loop, in case this failure was keeping
			// previous successes from showing up
		}

		n := countLeadingReady(resultsLeft)
		for _, res := range resultsLeft[:n] {
			if err := showResult(bw, res); err != nil {
				// assume later stages/apps in a pipe had enough input and
				// quit successfully, so quit successfully too
				return true
			}
		}
		resultsLeft = resultsLeft[n:]

		// flush output-buffer only if anything new was shown
		if n > 0 {
			bw.Flush()
		}
	}
	return ok
}

// showError standardizes how errors from this app look: red text on the
// standard error
func showError(err error) {
	os.Stderr.WriteString("\x1b[31m")
	os.Stderr.WriteString(err.Error())
	os.Stderr.WriteString("\x1b[0m\n")
}

// showResult emits one TSV line with all the stats for one input; errored
// inputs are silently skipped, as their error was already reported
func showResult(w *bufio.Writer, res stats) error {
	if res.result == resultError {
		return nil
	}

	// scratch-buffer for int-to-ASCII conversions, to avoid allocations
	var buf [64]byte

	w.WriteString(res.name)

	// column order must match the package-level `header` slice
	for _, v := range []int64{
		int64(res.bytes),
		int64(res.runes),
		int64(res.lines),
		int64(res.lf),
		int64(res.crlf),
		int64(res.spaces),
		int64(res.tabs),
		int64(res.trailing),
		int64(res.nulls),
		int64(res.fulls),
		int64(res.highs),
	} {
		w.WriteByte('\t')
		w.Write(strconv.AppendInt(buf[:0], v, 10))
	}

	w.WriteByte('\t')
	w.WriteString(bomLegend[res.bom])
	return w.WriteByte('\n')
}

// unique ensures items only appear once in the result, keeping the original
// slice unchanged and the first-seen order
func unique(src []string) []string {
	var out []string
	got := make(map[string]struct{})
	for _, s := range src {
		if _, dup := got[s]; dup {
			continue
		}
		out = append(out, s)
		got[s] = struct{}{}
	}
	return out
}

// findAllFiles does what it says, given a mix of file/folder paths, finding
// all files recursively in the case of folders; `ok` is false when any
// path can't be found/walked, but the rest are still looked up
func findAllFiles(paths []string) (found []string, ok bool) {
	got := make(map[string]struct{})
	ok = true

	for _, root := range paths {
		// a dash means standard input
		if root == `-` {
			if _, dup := got[root]; !dup {
				found = append(found, root)
				got[root] = struct{}{}
			}
			continue
		}

		_, err := os.Stat(root)
		if os.IsNotExist(err) {
			ok = false
			// on windows, file-not-found error messages may mention
			// `CreateFile`, even when trying to open files in read-only mode
			err := errors.New(`can't find file/folder named ` + root)
			showError(err)
			continue
		}

		err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
			if err != nil {
				return err
			}
			if d.IsDir() {
				return nil
			}
			if _, dup := got[path]; dup {
				return nil
			}
			found = append(found, path)
			got[path] = struct{}{}
			return nil
		})
		if err != nil {
			ok = false
			showError(err)
		}
	}
	return found, ok
}

// isZero enables branchless-counting, when xor-compared bytes are used
// as indices for it: only index 0 holds a 1
var isZero = [256]byte{1}

// counter makes it easy to change the int-size of almost all counters
type counter int

// statResult constrains possible result-states/values in type stats
type statResult int

const (
	// resultPending is the default not-yet-ready result-status
	resultPending = statResult(0)

	// resultError signals result should show as an error, instead of data
	resultError = statResult(1)

	// resultSuccess means result can be shown
	resultSuccess = statResult(2)
)

// bomType enumerates all the byte-order marks this app can detect
type bomType int

const (
	noBOM      = bomType(0)
	utf8BOM    = bomType(1)
	utf16leBOM = bomType(2)
	utf16beBOM = bomType(3)
	utf32leBOM = bomType(4)
	utf32beBOM = bomType(5)
)

// bomLegend maps bomType values to the text shown in the `bom` column
var bomLegend = []string{
	``,
	`UTF-8`,
	`UTF-16 LE`,
	`UTF-16 BE`,
	`UTF-32 LE`,
	`UTF-32 BE`,
}

// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read
	bytes int

	// lines counts lines, and is 0 only when the byte-count is also 0
	lines counter

	// runes counts utf-8 sequences, each of which can use up to 4 bytes and
	// is usually a complete symbol: `emoji` country-flags are commonly-used
	// counter-examples, as these `symbols` need 2 runes, using 8 bytes each
	runes counter

	// maxWidth is maximum byte-width of lines, excluding carriage-returns
	// and/or line-feeds
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// the type of byte-order mark detected
	bom bomType

	// name is the filepath of the file/source these stats are about
	name string

	// results keeps track of whether results are valid and/or ready
	result statResult
}

// updateStats does what it says, reading everything from a reader
func (res *stats) updateStats(r io.Reader) error {
	err := res.updateUsing(r)
	if err == io.EOF {
		err = nil
	}
	if err == nil {
		res.result = resultSuccess
	} else {
		res.result = resultError
	}
	return err
}

// checkBOM detects which (if any) byte-order mark the data start with; the
// 4-byte UTF-32 checks must come before the UTF-16 ones, since a UTF-32 LE
// mark starts with the same 2 bytes as a UTF-16 LE one
func checkBOM(data []byte) bomType {
	d := data
	l := len(data)
	if l >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf {
		return utf8BOM
	}
	if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
		return utf32leBOM
	}
	if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
		return utf32beBOM
	}
	if l >= 2 && d[0] == 0xff && d[1] == 0xfe {
		return utf16leBOM
	}
	if l >= 2 && d[0] == 0xfe && d[1] == 0xff {
		return utf16beBOM
	}
	return noBOM
}

// updateUsing helps func updateStats do its job
func (res *stats) updateUsing(r io.Reader) error {
	var width counter
	var prev1, prev2 byte
	var buf [16 * 1024]byte
	var tallies [256]uint64

	// bomPrefix gathers the first (up to) 4 bytes across reads: a reader's
	// first Read call isn't guaranteed to return enough bytes to detect a
	// byte-order mark, so relying on the first chunk alone would be wrong
	bomPrefix := make([]byte, 0, 4)

	for {
		n, err := r.Read(buf[:])
		if n < 1 {
			if err == io.EOF {
				res.bom = checkBOM(bomPrefix)
				res.lines = counter(tallies['\n'])
				res.tabs = counter(tallies['\t'])
				res.spaces = counter(tallies[' '])
				res.lf = counter(tallies['\n'])
				res.nulls = counter(tallies[0])
				res.fulls = counter(tallies[255])
				for i := 128; i < 256; i++ {
					res.highs += counter(tallies[i])
				}
				return res.handleEnd(width, prev1, prev2)
			}
			return err
		}

		chunk := buf[:n]
		for i := 0; len(bomPrefix) < 4 && i < len(chunk); i++ {
			bomPrefix = append(bomPrefix, chunk[i])
		}
		res.bytes += n

		for _, b := range chunk {
			// count values without branching, because it's fun
			tallies[b]++

			// handle non-ASCII runes, assuming input is valid UTF-8:
			// continuation bytes (10xx'xxxx) don't start a new rune
			res.runes += 1 - count(b&0xc0, 0x80)

			// handle line-feeds
			if b == '\n' {
				crlf := count(prev1, '\r')
				res.crlf += crlf

				// count lines with trailing spaces, whether these end with
				// a CRLF byte-pair or just a line-feed byte
				if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
					res.trailing++
				}

				// exclude any CR from the current line's width-count
				width -= crlf
				if res.maxWidth < width {
					res.maxWidth = width
				}

				prev2 = prev1
				prev1 = b
				width = 0
				continue
			}

			prev2 = prev1
			prev1 = b
			width++
		}
	}
}

// handleEnd fixes/finalizes stats when input data end; this func is only
// meant to be used by func updateStats, since it takes some of the latter's
// local variables
func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
	if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
		res.trailing++
	}

	if res.maxWidth < width {
		res.maxWidth = width
	}

	// avoid reporting 0 lines with a non-0 byte-count: this is unlike the
	// standard cmd-line tool `wc`
	if res.bytes > 0 && prev1 != '\n' {
		res.lines++
	}
	return nil
}

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly/branchlessly to totals
// func count(x, y byte) counter {
//     return counter(isZero[x^y])
// }

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly to totals
func count(x, y byte) counter {
	if x != y {
		return 0
	}
	return 1
}

// countLeadingReady finds how many items are ready to show at the start of a
// results-slice, which ensures output matches the original item-order
func countLeadingReady(values []stats) int {
	for i, v := range values {
		if v.result == resultPending {
			return i
		}
	}
	return len(values)
}