// File: coby.go

/*
The MIT License (MIT)

Copyright © 2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
To compile a smaller-sized command-line app, you can use the `go` command as
follows:

    go build -ldflags "-s -w" -trimpath coby.go
*/

package main

import (
	"bufio"
	"errors"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"sync"
)

// info is the help message; its leading line-feed is skipped when shown
const info = `
coby [options...] [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

- bytes
- lines
- how many lines have trailing spaces (trails)
- how many lines end with a CRLF pair
- all-bits-off (null) bytes
- all-bits-on (full) bytes
- top-bit-on (high) bytes
- which unicode byte-order-mark (bom) sequence the data start with

Some of these stats (lines, CRLFs, BOMs) only make sense for plain-text
data, and thus may not be meaningful for general binary data.

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them. A mix
of files/folders is supported for convenience.

The only option available is to show this help message, using any of
"-h", "--h", "-help", or "--help", without the quotes.
`

// header has all the values for the first output line; func showResult
// emits its columns in this same order
var header = []string{
	`name`,
	`bytes`,
	`lines`,
	`lf`,
	`crlf`,
	`spaces`,
	`tabs`,
	`trails`,
	`nulls`,
	`fulls`,
	`highs`,
	`bom`,
}

// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully
type event struct {
	// Index points to the task's entry in the results-slice
	Index int

	// Stats has all the byte-related stats
	Stats stats

	// Err is the completed task's error, or lack of
	Err error
}

func main() {
	args := os.Args[1:]

	if len(args) > 0 {
		switch args[0] {
		case `-h`, `--h`, `-help`, `--help`:
			// skip the leading line-feed from the help message
			os.Stderr.WriteString(info[1:])
			return

		case `--`:
			args = args[1:]
		}
	}

	// show first/heading line right away, to let users know things are
	// happening
	for i, s := range header {
		if i > 0 {
			os.Stdout.WriteString("\t")
		}
		os.Stdout.WriteString(s)
	}
	// assume an error means later stages/apps in a pipe had enough input and
	// quit successfully, so quit successfully too
	_, err := os.Stdout.WriteString("\n")
	if err != nil {
		return
	}

	// names has all filepaths given, ignoring repetitions
	names, ok := findAllFiles(deduplicate(args))
	if !ok {
		os.Exit(1)
	}
	if len(names) == 0 {
		// a dash means standard input
		names = []string{`-`}
	}

	events := make(chan event)
	go handleInputs(names, events)
	if !handleOutput(os.Stdout, len(names), events) {
		os.Exit(1)
	}
}

// asyncArgs bundles what all concurrent input-handling tasks share
type asyncArgs struct {
	// Results is where each task sends its single result-event
	Results chan event

	// Permissions limits how many worker tasks can be active at the same
	// time: when given many filepaths to work on, rate-limiting avoids
	// a massive number of concurrent tasks which read and process input
	Permissions chan struct{}

	// Tasks is to wait for all tasks to end before quitting the app
	Tasks *sync.WaitGroup
}

// handleInputs launches all the tasks which do the actual work, limiting how
// many inputs are being worked on at the same time; it closes the `events`
// channel once all tasks are over
func handleInputs(names []string, events chan event) {
	var inputs sync.WaitGroup
	// the number of tasks is always known in advance
	inputs.Add(len(names))

	args := asyncArgs{
		Results:     events,
		Permissions: make(chan struct{}, runtime.NumCPU()),
		Tasks:       &inputs,
	}

	defer close(args.Results) // allow the output-reporter task to end
	defer close(args.Permissions)

	for i, name := range names {
		// wait until some concurrency-room is available, before proceeding
		args.Permissions <- struct{}{}
		go handleInputAsync(i, name, args)
	}

	// wait for all inputs, before closing the `events` channel, which in turn
	// would quit the whole app right away
	args.Tasks.Wait()
}

// handleInputAsync is the dispatched func used in func handleInputs
func handleInputAsync(i int, name string, args asyncArgs) {
	// deferred calls run last-to-first: the concurrency-slot is released
	// before the task is marked as done
	defer args.Tasks.Done()
	defer func() { <-args.Permissions }()

	res, err := handleInput(name)
	args.Results <- event{Index: i, Stats: res, Err: err}
}

// handleInput handles each work-item for func handleInputs: a dash means
// standard input, while anything else is opened as a file
func handleInput(path string) (stats, error) {
	var res stats
	res.name = path

	if path == `-` {
		err := res.updateStats(os.Stdin)
		return res, err
	}

	f, err := os.Open(path)
	if err != nil {
		res.result = resultError
		// on windows, file-not-found error messages may mention `CreateFile`,
		// even when trying to open files in read-only mode
		return res, errors.New(`can't open file named ` + path)
	}
	defer f.Close()

	err = res.updateStats(f)
	return res, err
}

// handleOutput asynchronously updates output as results are known, whether
// it's errors or successful results; returns whether it succeeded, which
// means no errors happened
func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
	ok = true
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	results := make([]stats, rescount)

	// keep track of which tasks are over, so that on each event all leading
	// results which are ready are shown: all of this ensures prompt output
	// updates as soon as results come in, while keeping the original order
	// of the names/filepaths given
	resultsLeft := results

	for v := range events {
		results[v.Index] = v.Stats
		if v.Err != nil {
			ok = false
			// show pending output before the error message
			bw.Flush()
			showError(v.Err)

			// stay in the current loop, in case this failure was keeping
			// previous successes from showing up
		}

		for len(resultsLeft) > 0 && resultsLeft[0].result != resultPending {
			if err := showResult(bw, resultsLeft[0]); err != nil {
				// assume later stages/apps in a pipe had enough input
				return ok
			}
			resultsLeft = resultsLeft[1:]
		}

		// show leading results immediately, if any
		bw.Flush()
	}

	return ok
}

// showError writes the error message given as its own line on stderr
func showError(err error) {
	os.Stderr.WriteString(err.Error())
	os.Stderr.WriteString("\n")
}

// showResult emits a TSV line with the stats given, following the column
// order from the header; failed results emit nothing; the writer keeps
// its first error, so only the last write needs checking
func showResult(w *bufio.Writer, s stats) error {
	if s.result == resultError {
		return nil
	}

	columns := [...]counter{
		s.bytes,
		s.lines,
		s.lf,
		s.crlf,
		s.spaces,
		s.tabs,
		s.trailing,
		s.nulls,
		s.fulls,
		s.highs,
	}

	var buf [64]byte
	w.WriteString(s.name)
	for _, n := range columns {
		w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(n), 10))
	}
	w.WriteByte('\t')
	w.WriteString(bomLegend[s.bom])
	return w.WriteByte('\n')
}

// deduplicate avoids repeating items, keeping the original slice unchanged
func deduplicate(src []string) []string {
	var unique []string
	got := make(map[string]struct{})

	for _, s := range src {
		if _, ok := got[s]; ok {
			continue
		}

		unique = append(unique, s)
		got[s] = struct{}{}
	}

	return unique
}

// findAllFiles can be given a mix of file/folder paths, finding all files
// recursively in folders, avoiding duplicates; all errors are shown on
// stderr as they happen, and the second result is false if any happened
func findAllFiles(paths []string) (found []string, ok bool) {
	ok = true
	got := make(map[string]struct{})

	// add remembers the path given, unless it's a repeat
	add := func(path string) {
		if _, dup := got[path]; dup {
			return
		}
		got[path] = struct{}{}
		found = append(found, path)
	}

	// fail shows the error given, and remembers the overall failure
	fail := func(err error) {
		showError(err)
		ok = false
	}

	// walk collects all non-folder entries; an error is only returned,
	// since the WalkDir caller below is what reports it, exactly once
	walk := func(path string, entry fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if !entry.IsDir() {
			add(path)
		}
		return nil
	}

	for _, s := range paths {
		// a dash means standard input
		if s == `-` {
			add(s)
			continue
		}

		st, err := os.Stat(s)
		if os.IsNotExist(err) {
			// on windows, file-not-found messages may mention `CreateFile`,
			// even when trying to open files in read-only mode
			fail(errors.New(`can't find file/folder named ` + s))
			continue
		}

		if err != nil {
			fail(err)
			continue
		}

		if !st.IsDir() {
			add(s)
			continue
		}

		if err := filepath.WalkDir(s, walk); err != nil {
			fail(err)
		}
	}

	return found, ok
}

// counter makes it easy to change the int-size of almost all counters
type counter uint64

// statResult constrains possible result-states/values in type stats
type statResult int

const (
	// resultPending is the default not-yet-ready result-status
	resultPending statResult = 0

	// resultError means result should show as an error, instead of data
	resultError statResult = 1

	// resultSuccess means a result's stats are ready to show
	resultSuccess statResult = 2
)

// bomType is the type for the byte-order-mark enumeration
type bomType int

// these values are also the indices for their names in bomLegend
const (
	noBOM      bomType = 0
	utf8BOM    bomType = 1
	utf16leBOM bomType = 2
	utf16beBOM bomType = 3
	utf32leBOM bomType = 4
	utf32beBOM bomType = 5
)

// bomLegend has the string-equivalents of the bomType constants
var bomLegend = []string{
	``,
	`UTF-8`,
	`UTF-16 LE`,
	`UTF-16 BE`,
	`UTF-32 LE`,
	`UTF-32 BE`,
}

// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read
	bytes counter

	// lines counts lines, and is 0 only when the byte-count is also 0
	lines counter

	// maxWidth is maximum byte-width of lines, excluding carriage-returns
	// and/or line-feeds
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// the type of byte-order mark detected
	bom bomType

	// name is the filepath of the file/source these stats are about
	name string

	// result keeps track of whether results are valid and/or ready
	result statResult
}

// updateStats does what it says, reading everything from a reader, and
// marking the result as either a success or an error
func (res *stats) updateStats(r io.Reader) error {
	err := res.updateUsing(r)
	if errors.Is(err, io.EOF) {
		err = nil
	}

	if err != nil {
		res.result = resultError
		return err
	}

	res.result = resultSuccess
	return nil
}

// checkBOM finds which byte-order-mark the data given start with, if any
func checkBOM(data []byte) bomType {
	d := data
	l := len(data)

	switch {
	case l >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf:
		return utf8BOM

	// the UTF-32 checks must come before the UTF-16 ones, since the
	// UTF-32 LE mark starts with the same 2 bytes as the UTF-16 LE one
	case l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0:
		return utf32leBOM
	case l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff:
		return utf32beBOM

	case l >= 2 && d[0] == 0xff && d[1] == 0xfe:
		return utf16leBOM
	case l >= 2 && d[0] == 0xfe && d[1] == 0xff:
		return utf16beBOM
	}

	return noBOM
}

// updateUsing helps func updateStats do its job: it returns nil when the
// input ends normally, or the first read-error otherwise
func (res *stats) updateUsing(r io.Reader) error {
	// maxEmptyReads limits how many consecutive reads can give no data
	// and no error, before giving up on a reader
	const maxEmptyReads = 100

	var buf [32 * 1024]byte
	var tallies [256]uint64

	var width counter
	var prev1, prev2 byte
	emptyReads := 0

	for {
		n, err := r.Read(buf[:])
		if n < 1 {
			if err == nil {
				// readers can return no data and no error, which doesn't
				// mean the input is over: just try again, up to a point
				emptyReads++
				if emptyReads < maxEmptyReads {
					continue
				}
				err = io.ErrNoProgress
			}

			// the input is over, successfully or not: turn byte-tallies
			// into their stats
			res.lines = counter(tallies['\n'])
			res.tabs = counter(tallies['\t'])
			res.spaces = counter(tallies[' '])
			res.lf = counter(tallies['\n'])
			res.nulls = counter(tallies[0])
			res.fulls = counter(tallies[255])
			for i := 128; i < 256; i++ {
				res.highs += counter(tallies[i])
			}

			if err == io.EOF {
				return res.handleEnd(width, prev1, prev2)
			}
			return err
		}
		emptyReads = 0

		chunk := buf[:n]
		if res.bytes == 0 {
			// only the very first chunk can start with a byte-order mark
			res.bom = checkBOM(chunk)
		}
		res.bytes += counter(n)

		for _, b := range chunk {
			// count values without branching, because it's fun
			tallies[b]++

			if b != '\n' {
				prev2 = prev1
				prev1 = b
				width++
				continue
			}

			// handle line-feeds

			crlf := count(prev1, '\r')
			res.crlf += crlf

			// count lines with trailing spaces, whether these end with
			// a CRLF byte-pair or just a line-feed byte
			if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
				res.trailing++
			}

			// exclude any CR from the current line's width-count
			width -= crlf
			if res.maxWidth < width {
				res.maxWidth = width
			}

			prev2 = prev1
			prev1 = b
			width = 0
		}
	}
}

// handleEnd fixes/finalizes stats when input data end; this func is only
// meant to be used by func updateUsing, since it takes some of the latter's
// local variables; it always returns nil
func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
	// the last line may have trailing spaces, even without a line-feed
	if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
		res.trailing++
	}

	if res.maxWidth < width {
		res.maxWidth = width
	}

	// avoid reporting 0 lines with a non-0 byte-count: this is unlike the
	// standard cmd-line tool `wc`
	if res.bytes > 0 && prev1 != '\n' {
		res.lines++
	}

	return nil
}

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly/branchlessly to totals
func count(x, y byte) counter {
	if x == y {
		return 1
	}
	return 0
}