/*
The MIT License (MIT)

Copyright (c) 2026 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
To compile a smaller-sized command-line app, you can use the `go` command as
follows:

	go build -ldflags "-s -w" -trimpath coby.go
*/

package main

import (
	"bufio"
	"errors"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"sync"
)

// info is the help message; its leading line-feed is skipped when shown
const info = `
coby [options...] [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - how many lines have trailing spaces (trails)
    - how many lines end with a CRLF pair
    - all-bits-off (null) bytes
    - all-bits-on (full) bytes
    - top-bit-on (high) bytes
    - which unicode byte-order-mark (bom) sequence the data start with

Some of these stats (lines, CRLFs, BOMs) only make sense for plain-text
data, and thus may not be meaningful for general binary data.

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them. A mix
of files/folders is supported for convenience.

The only option available is to show this help message, using any of
"-h", "--h", "-help", or "--help", without the quotes.
`

// header has all the values for the first output line
var header = []string{
	`name`,
	`bytes`,
	`lines`,
	`lf`,
	`crlf`,
	`spaces`,
	`tabs`,
	`trails`,
	`nulls`,
	`fulls`,
	`highs`,
	`bom`,
}

// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully
type event struct {
	// Index points to the task's entry in the results-slice
	Index int

	// Stats has all the byte-related stats
	Stats stats

	// Err is the completed task's error, or lack of
	Err error
}

func main() {
	args := os.Args[1:]

	if len(args) > 0 {
		switch args[0] {
		case `-h`, `--h`, `-help`, `--help`:
			os.Stdout.WriteString(info[1:])
			return

		case `--`:
			args = args[1:]
		}
	}

	// show first/heading line right away, to let users know things are
	// happening
	for i, s := range header {
		if i > 0 {
			os.Stdout.WriteString("\t")
		}
		os.Stdout.WriteString(s)
	}
	// assume an error means later stages/apps in a pipe had enough input and
	// quit successfully, so quit successfully too
	_, err := os.Stdout.WriteString("\n")
	if err != nil {
		return
	}

	// names has all filepaths given, ignoring repetitions
	names, ok := findAllFiles(args)
	if !ok {
		os.Exit(1)
	}
	// only fall back to standard input when no paths were given at all:
	// folders without any files in them must not make the app wait on stdin
	if len(args) == 0 {
		names = []string{`-`}
	}

	events := make(chan event)
	go handleInputs(names, events)
	if !handleOutput(os.Stdout, len(names), events) {
		os.Exit(1)
	}
}

// handleInputs launches all the tasks which do the actual work, limiting how
// many inputs are being worked on at the same time; it closes the `events`
// channel only after all tasks have sent their results
func handleInputs(names []string, events chan<- event) {
	defer close(events) // allow the output-reporter task to end

	var tasks sync.WaitGroup
	// the number of tasks is always known in advance
	tasks.Add(len(names))

	// permissions is buffered to limit concurrency to the core-count
	permissions := make(chan struct{}, runtime.NumCPU())
	defer close(permissions)

	for i, name := range names {
		// wait until some concurrency-room is available, before proceeding
		permissions <- struct{}{}

		go func(i int, name string) {
			defer tasks.Done()

			res, err := handleInput(name)
			// give the concurrency-slot back before the possibly-blocking
			// send to the output-reporter
			<-permissions
			events <- event{Index: i, Stats: res, Err: err}
		}(i, name)
	}

	// wait for all inputs, before closing the `events` channel, which in turn
	// would quit the whole app right away
	tasks.Wait()
}

// handleInput handles each work-item for func handleInputs: a dash as the
// path means reading from the standard input; the stats returned always
// have their name and result-status set, even on failure
func handleInput(path string) (stats, error) {
	var res stats
	res.name = path

	if path == `-` {
		err := res.updateStats(os.Stdin)
		return res, err
	}

	f, err := os.Open(path)
	if err != nil {
		res.result = resultError
		// on windows, file-not-found error messages may mention `CreateFile`,
		// even when trying to open files in read-only mode
		return res, errors.New(`can't open file named ` + path)
	}
	defer f.Close()

	err = res.updateStats(f)
	return res, err
}

// handleOutput asynchronously updates output as results are known, whether
// it's errors or successful results; returns whether it succeeded, which
// means no errors happened
func handleOutput(w io.Writer, inputs int, events <-chan event) (ok bool) {
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	ok = true
	results := make([]stats, inputs)

	// keep track of which tasks are over, so that on each event all leading
	// results which are ready are shown: all of this ensures prompt output
	// updates as soon as results come in, while keeping the original order
	// of the names/filepaths given
	resultsLeft := results

	for v := range events {
		results[v.Index] = v.Stats
		if v.Err != nil {
			ok = false
			// flush first, so lines already buffered for output are written
			// before the error message is
			bw.Flush()
			showError(v.Err)

			// stay in the current loop, in case this failure was keeping
			// previous successes from showing up
		}

		for len(resultsLeft) > 0 {
			if resultsLeft[0].result == resultPending {
				break
			}

			if err := showResult(bw, resultsLeft[0]); err != nil {
				// assume later stages/apps in a pipe had enough input
				return ok
			}
			resultsLeft = resultsLeft[1:]
		}

		// show leading results immediately, if any
		bw.Flush()
	}

	return ok
}

// showError writes the error message given as its own line on the standard
// error
func showError(err error) {
	os.Stderr.WriteString(err.Error())
	os.Stderr.WriteString("\n")
}

// showResult shows a TSV line for results marked as successful, doing nothing
// when given other types of results; columns follow the order of the items
// in the `header` slice
func showResult(w *bufio.Writer, s stats) error {
	if s.result != resultSuccess {
		return nil
	}

	// values has all numeric columns, in the same order as in `header`
	values := [...]counter{
		s.bytes, s.lines, s.lf, s.crlf, s.spaces,
		s.tabs, s.trailing, s.nulls, s.fulls, s.highs,
	}

	// buf has room for a tab followed by the 20 digits of the max uint64
	var buf [24]byte

	// errors from these writes aren't checked one by one, since buffered
	// writers remember their first error, which the last write returns
	w.WriteString(s.name)
	for _, v := range values {
		w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(v), 10))
	}
	w.WriteByte('\t')
	w.WriteString(bomLegend[s.bom])
	return w.WriteByte('\n')
}

// findAllFiles can be given a mix of file/folder paths, finding all files
// recursively in folders, avoiding duplicates; paths are compared exactly
// as given, so different spellings of the same path count as different;
// errors are shown on the standard error as they happen, and make the
// success flag false
func findAllFiles(paths []string) (files []string, success bool) {
	got := make(map[string]struct{})
	success = true

	for _, path := range paths {
		if _, ok := got[path]; ok {
			continue
		}
		got[path] = struct{}{}

		// a dash means standard input
		if path == `-` {
			files = append(files, path)
			continue
		}

		st, err := os.Stat(path)
		if os.IsNotExist(err) {
			// on windows, file-not-found messages may mention `CreateFile`,
			// even when trying to open files in read-only mode
			err = errors.New(`can't find file/folder named ` + path)
		}

		if err != nil {
			showError(err)
			success = false
			continue
		}

		if !st.IsDir() {
			files = append(files, path)
			continue
		}

		root := path
		err = filepath.WalkDir(root, func(p string, d fs.DirEntry, err error) error {
			// check errors before anything else, since the entry can be nil
			// when there's an error; the error is shown only once, after
			// the walk is over
			if err != nil {
				return err
			}

			// the top-level folder is always in `got` by now: checking it
			// for repetitions would skip the whole folder every time
			if p == root {
				return nil
			}

			if _, ok := got[p]; ok {
				if d.IsDir() {
					// folder was already walked via a previous argument
					return fs.SkipDir
				}
				return nil
			}
			got[p] = struct{}{}

			if d.IsDir() {
				return nil
			}

			files = append(files, p)
			return nil
		})

		if err != nil {
			showError(err)
			success = false
		}
	}

	return files, success
}

// counter makes it easy to change the int-size of almost all counters
type counter uint64

// statResult constrains possible result-states/values in type stats
type statResult int

const (
	// resultPending is the default not-yet-ready result-status
	resultPending = statResult(0)

	// resultError means result should show as an error, instead of data
	resultError = statResult(1)

	// resultSuccess means a result's stats are ready to show
	resultSuccess = statResult(2)
)

// bomType is the type for the byte-order-mark enumeration
type bomType int

const (
	noBOM      = bomType(0)
	utf8BOM    = bomType(1)
	utf16leBOM = bomType(2)
	utf16beBOM = bomType(3)
	utf32leBOM = bomType(4)
	utf32beBOM = bomType(5)
)

// bomLegend has the string-equivalents of the bomType constants
var bomLegend = []string{
	``,
	`UTF-8`,
	`UTF-16 LE`,
	`UTF-16 BE`,
	`UTF-32 LE`,
	`UTF-32 BE`,
}

// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read
	bytes counter

	// lines counts lines, and is 0 only when the byte-count is also 0
	lines counter

	// maxWidth is maximum byte-width of lines, excluding carriage-returns
	// and/or line-feeds
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// the type of byte-order mark detected
	bom bomType

	// name is the filepath of the file/source these stats are about
	name string

	// result keeps track of whether results are valid and/or ready
	result statResult
}

// updateStats does what it says, reading everything from a reader; it also
// updates the result-status, depending on whether reading succeeded
func (res *stats) updateStats(r io.Reader) error {
	err := res.updateUsing(r)
	if err == io.EOF {
		err = nil
	}

	if err == nil {
		res.result = resultSuccess
	} else {
		res.result = resultError
	}
	return err
}

// checkBOM finds which byte-order-mark the data given start with, if any;
// the UTF-32 checks come before the UTF-16 ones, since the UTF-32 LE mark
// starts with the same 2 bytes as the UTF-16 LE one
func checkBOM(data []byte) bomType {
	d := data
	l := len(data)

	if l >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf {
		return utf8BOM
	}
	if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
		return utf32leBOM
	}
	if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
		return utf32beBOM
	}
	if l >= 2 && data[0] == 0xff && data[1] == 0xfe {
		return utf16leBOM
	}
	if l >= 2 && data[0] == 0xfe && data[1] == 0xff {
		return utf16beBOM
	}

	return noBOM
}

// updateUsing helps func updateStats do its job: it reads until the reader
// given fails or ends, returning nil when all input was read
func (res *stats) updateUsing(r io.Reader) error {
	var buf [32 * 1024]byte
	var tallies [256]uint64

	// head remembers the first few bytes across reads, so byte-order marks
	// are detected even when the first reads are shorter than the marks
	var head [4]byte
	headLen := 0

	var width counter
	var prev1, prev2 byte

	for {
		n, err := r.Read(buf[:])
		if n < 1 {
			// an empty read without an error means nothing happened, and
			// doesn't mean the input is over
			if err == nil {
				continue
			}

			res.lines = counter(tallies['\n'])
			res.tabs = counter(tallies['\t'])
			res.spaces = counter(tallies[' '])
			res.lf = counter(tallies['\n'])
			res.nulls = counter(tallies[0])
			res.fulls = counter(tallies[255])
			for i := 128; i < len(tallies); i++ {
				res.highs += counter(tallies[i])
			}

			if err == io.EOF {
				return res.handleEnd(width, prev1, prev2)
			}
			return err
		}

		chunk := buf[:n]
		if headLen < len(head) {
			headLen += copy(head[headLen:], chunk)
			res.bom = checkBOM(head[:headLen])
		}
		res.bytes += counter(n)

		for _, b := range chunk {
			// count values without branching, because it's fun
			tallies[b]++

			if b != '\n' {
				prev2 = prev1
				prev1 = b
				width++
				continue
			}

			// handle line-feeds

			crlf := count(prev1, '\r')
			res.crlf += crlf

			// count lines with trailing spaces, whether these end with
			// a CRLF byte-pair or just a line-feed byte
			if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
				res.trailing++
			}

			// exclude any CR from the current line's width-count
			width -= crlf
			if res.maxWidth < width {
				res.maxWidth = width
			}

			prev2 = prev1
			prev1 = b
			width = 0
		}
	}
}

// handleEnd fixes/finalizes stats when input data end; this func is only
// meant to be used by func updateUsing, since it takes some of the latter's
// local variables
func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
	// the last line may have trailing spaces without ending in a line-feed
	if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
		res.trailing++
	}

	if res.maxWidth < width {
		res.maxWidth = width
	}

	// avoid reporting 0 lines with a non-0 byte-count: this is unlike the
	// standard cmd-line tool `wc`
	if res.bytes > 0 && prev1 != '\n' {
		res.lines++
	}

	return nil
}

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly to totals
func count(x, y byte) counter {
	if x == y {
		return 1
	}
	return 0
}