File: coby.go 1 /* 2 The MIT License (MIT) 3 4 Copyright (c) 2026 pacman64 5 6 Permission is hereby granted, free of charge, to any person obtaining a copy of 7 this software and associated documentation files (the "Software"), to deal 8 in the Software without restriction, including without limitation the rights to 9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 of the Software, and to permit persons to whom the Software is furnished to do 11 so, subject to the following conditions: 12 13 The above copyright notice and this permission notice shall be included in all 14 copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 SOFTWARE. 23 */ 24 25 /* 26 To compile a smaller-sized command-line app, you can use the `go` command as 27 follows: 28 29 go build -ldflags "-s -w" -trimpath coby.go 30 */ 31 32 package main 33 34 import ( 35 "bufio" 36 "errors" 37 "io" 38 "io/fs" 39 "os" 40 "path/filepath" 41 "runtime" 42 "strconv" 43 "sync" 44 ) 45 46 const info = ` 47 coby [options...] [files/folders...] 48 49 50 COunt BYtes finds out some simple byte-related stats, counting 51 52 - bytes 53 - lines 54 - how many lines have trailing spaces (trails) 55 - how many lines end with a CRLF pair 56 - all-bits-off (null) bytes 57 - all-bits-on (full) bytes 58 - top-bit-on (high) bytes 59 - which unicode byte-order-mark (bom) sequence the data start with 60 61 Some of these stats (lines, CRLFs, BOMs) only make sense for plain-text 62 data, and thus may not be meaningful for general binary data. 63 64 The output is TSV (tab-separated values) lines, where the first line has 65 all the column names. 66 67 When no filepaths are given, the standard input is used by default. All 68 folder names given expand recursively into all filenames in them. A mix 69 of files/folders is supported for convenience. 70 71 The only option available is to show this help message, using any of 72 "-h", "--h", "-help", or "--help", without the quotes. 73 ` 74 75 // header has all the values for the first output line 76 var header = []string{ 77 `name`, 78 `bytes`, 79 `lines`, 80 `lf`, 81 `crlf`, 82 `spaces`, 83 `tabs`, 84 `trails`, 85 `nulls`, 86 `fulls`, 87 `highs`, 88 `bom`, 89 } 90 91 // event has what the output-reporting task needs to show the results of a 92 // task which has just completed, perhaps unsuccessfully 93 type event struct { 94 // Index points to the task's entry in the results-slice 95 Index int 96 97 // Stats has all the byte-related stats 98 Stats stats 99 100 // Err is the completed task's error, or lack of 101 Err error 102 } 103 104 func main() { 105 args := os.Args[1:] 106 107 if len(args) > 0 { 108 switch args[0] { 109 case `-h`, `--h`, `-help`, `--help`: 110 os.Stdout.WriteString(info[1:]) 111 return 112 113 case `--`: 114 args = args[1:] 115 } 116 } 117 118 // show first/heading line right away, to let users know things are 119 // happening 120 for i, s := range header { 121 if i > 0 { 122 os.Stdout.WriteString("\t") 123 } 124 os.Stdout.WriteString(s) 125 } 126 // assume an error means later stages/apps in a pipe had enough input and 127 // quit successfully, so quit successfully too 128 _, err := os.Stdout.WriteString("\n") 129 if err != nil { 130 return 131 } 132 133 // names has all filepaths given, ignoring repetitions 134 names, ok := findAllFiles(args) 135 if !ok { 136 os.Exit(1) 137 return 138 } 139 if len(names) == 0 { 140 names = []string{`-`} 141 } 142 143 events := make(chan event) 144 go handleInputs(names, events) 145 if !handleOutput(os.Stdout, len(names), events) { 146 os.Exit(1) 147 return 148 } 149 } 150 151 // handleInputs launches all the tasks which do the actual work, limiting how 152 // many inputs are being worked on at the same time 153 func handleInputs(names []string, events chan<- event) { 154 defer close(events) // allow the output-reporter task to end 155 156 var tasks sync.WaitGroup 157 // the number of tasks is always known in advance 158 tasks.Add(len(names)) 159 160 // permissions is buffered to limit concurrency to the core-count 161 permissions := make(chan struct{}, runtime.NumCPU()) 162 defer close(permissions) 163 164 for i, name := range names { 165 // wait until some concurrency-room is available, before proceeding 166 permissions <- struct{}{} 167 168 go func(i int, name string) { 169 defer tasks.Done() 170 171 res, err := handleInput(name) 172 <-permissions 173 events <- event{Index: i, Stats: res, Err: err} 174 }(i, name) 175 } 176 177 // wait for all inputs, before closing the `events` channel, which in turn 178 // would quit the whole app right away 179 tasks.Wait() 180 } 181 182 // handleInput handles each work-item for func handleInputs 183 func handleInput(path string) (stats, error) { 184 var res stats 185 res.name = path 186 187 if path == `-` { 188 err := res.updateStats(os.Stdin) 189 return res, err 190 } 191 192 f, err := os.Open(path) 193 if err != nil { 194 res.result = resultError 195 // on windows, file-not-found error messages may mention `CreateFile`, 196 // even when trying to open files in read-only mode 197 return res, errors.New(`can't open file named ` + path) 198 } 199 defer f.Close() 200 201 err = res.updateStats(f) 202 return res, err 203 } 204 205 // handleOutput asynchronously updates output as results are known, whether 206 // it's errors or successful results; returns whether it succeeded, which 207 // means no errors happened 208 func handleOutput(w io.Writer, inputs int, events <-chan event) (ok bool) { 209 bw := bufio.NewWriter(w) 210 defer bw.Flush() 211 212 ok = true 213 results := make([]stats, inputs) 214 215 // keep track of which tasks are over, so that on each event all leading 216 // results which are ready are shown: all of this ensures prompt output 217 // updates as soon as results come in, while keeping the original order 218 // of the names/filepaths given 219 resultsLeft := results 220 221 for v := range events { 222 results[v.Index] = v.Stats 223 if v.Err != nil { 224 ok = false 225 bw.Flush() 226 showError(v.Err) 227 228 // stay in the current loop, in case this failure was keeping 229 // previous successes from showing up 230 } 231 232 for len(resultsLeft) > 0 { 233 if resultsLeft[0].result == resultPending { 234 break 235 } 236 237 if err := showResult(bw, resultsLeft[0]); err != nil { 238 // assume later stages/apps in a pipe had enough input 239 return ok 240 } 241 resultsLeft = resultsLeft[1:] 242 } 243 244 // show leading results immediately, if any 245 bw.Flush() 246 } 247 248 return ok 249 } 250 251 func showError(err error) { 252 os.Stderr.WriteString(err.Error()) 253 os.Stderr.WriteString("\n") 254 } 255 256 // showResult shows a TSV line for results marked as successful, doing nothing 257 // when given other types of results 258 func showResult(w *bufio.Writer, s stats) error { 259 if s.result != resultSuccess { 260 return nil 261 } 262 263 var buf [24]byte 264 w.WriteString(s.name) 265 w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.bytes), 10)) 266 w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lines), 10)) 267 w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.lf), 10)) 268 w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.crlf), 10)) 269 w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.spaces), 10)) 270 w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.tabs), 10)) 271 w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.trailing), 10)) 272 w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.nulls), 10)) 273 w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.fulls), 10)) 274 w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(s.highs), 10)) 275 w.WriteByte('\t') 276 w.WriteString(bomLegend[s.bom]) 277 return w.WriteByte('\n') 278 } 279 280 // findAllFiles can be given a mix of file/folder paths, finding all files 281 // recursively in folders, avoiding duplicates 282 func findAllFiles(paths []string) (files []string, success bool) { 283 walk := filepath.WalkDir 284 got := make(map[string]struct{}) 285 success = true 286 287 for _, path := range paths { 288 if _, ok := got[path]; ok { 289 continue 290 } 291 got[path] = struct{}{} 292 293 // a dash means standard input 294 if path == `-` { 295 files = append(files, path) 296 continue 297 } 298 299 info, err := os.Stat(path) 300 if os.IsNotExist(err) { 301 // on windows, file-not-found messages may mention `CreateFile`, 302 // even when trying to open files in read-only mode 303 err = errors.New(`can't find file/folder named ` + path) 304 } 305 306 if err != nil { 307 showError(err) 308 success = false 309 continue 310 } 311 312 if !info.IsDir() { 313 files = append(files, path) 314 continue 315 } 316 317 err = walk(path, func(path string, info fs.DirEntry, err error) error { 318 path, err = filepath.Abs(path) 319 if err != nil { 320 showError(err) 321 success = false 322 return err 323 } 324 325 if _, ok := got[path]; ok { 326 if info.IsDir() { 327 return fs.SkipDir 328 } 329 return nil 330 } 331 got[path] = struct{}{} 332 333 if err != nil { 334 showError(err) 335 success = false 336 return err 337 } 338 339 if info.IsDir() { 340 return nil 341 } 342 343 files = append(files, path) 344 return nil 345 }) 346 347 if err != nil { 348 showError(err) 349 success = false 350 } 351 } 352 353 return files, success 354 } 355 356 // counter makes it easy to change the int-size of almost all counters 357 type counter uint64 358 359 // statResult constrains possible result-states/values in type stats 360 type statResult int 361 362 const ( 363 // resultPending is the default not-yet-ready result-status 364 resultPending = statResult(0) 365 366 // resultError means result should show as an error, instead of data 367 resultError = statResult(1) 368 369 // resultSuccess means a result's stats are ready to show 370 resultSuccess = statResult(2) 371 ) 372 373 // bomType is the type for the byte-order-mark enumeration 374 type bomType int 375 376 const ( 377 noBOM = bomType(0) 378 utf8BOM = bomType(1) 379 utf16leBOM = bomType(2) 380 utf16beBOM = bomType(3) 381 utf32leBOM = bomType(4) 382 utf32beBOM = bomType(5) 383 ) 384 385 // bomLegend has the string-equivalents of the bomType constants 386 var bomLegend = []string{ 387 ``, 388 `UTF-8`, 389 `UTF-16 LE`, 390 `UTF-16 BE`, 391 `UTF-32 LE`, 392 `UTF-32 BE`, 393 } 394 395 // stats has all the size-stats for some input, as well as a way to 396 // skip showing results, in case of an error such as `file not found` 397 type stats struct { 398 // bytes counts all bytes read 399 bytes counter 400 401 // lines counts lines, and is 0 only when the byte-count is also 0 402 lines counter 403 404 // maxWidth is maximum byte-width of lines, excluding carriage-returns 405 // and/or line-feeds 406 maxWidth counter 407 408 // nulls counts all-bits-off bytes 409 nulls counter 410 411 // fulls counts all-bits-on bytes 412 fulls counter 413 414 // highs counts bytes with their `top` (highest-order) bit on 415 highs counter 416 417 // spaces counts ASCII spaces 418 spaces counter 419 420 // tabs counts ASCII tabs 421 tabs counter 422 423 // trailing counts lines with trailing spaces in them 424 trailing counter 425 426 // lf counts ASCII line-feeds as their own byte-values: this means its 427 // value will always be at least the same as field `crlf` 428 lf counter 429 430 // crlf counts ASCII CRLF byte-pairs 431 crlf counter 432 433 // the type of byte-order mark detected 434 bom bomType 435 436 // name is the filepath of the file/source these stats are about 437 name string 438 439 // results keeps track of whether results are valid and/or ready 440 result statResult 441 } 442 443 // updateStats does what it says, reading everything from a reader 444 func (res *stats) updateStats(r io.Reader) error { 445 err := res.updateUsing(r) 446 if err == io.EOF { 447 err = nil 448 } 449 450 if err == nil { 451 res.result = resultSuccess 452 } else { 453 res.result = resultError 454 } 455 return err 456 } 457 458 func checkBOM(data []byte) bomType { 459 d := data 460 l := len(data) 461 462 if l >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf { 463 return utf8BOM 464 } 465 if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 { 466 return utf32leBOM 467 } 468 if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff { 469 return utf32beBOM 470 } 471 if l >= 2 && data[0] == 0xff && data[1] == 0xfe { 472 return utf16leBOM 473 } 474 if l >= 2 && data[0] == 0xfe && data[1] == 0xff { 475 return utf16beBOM 476 } 477 478 return noBOM 479 } 480 481 // updateUsing helps func updateStats do its job 482 func (res *stats) updateUsing(r io.Reader) error { 483 var buf [32 * 1024]byte 484 var tallies [256]uint64 485 486 var width counter 487 var prev1, prev2 byte 488 489 for { 490 n, err := r.Read(buf[:]) 491 if n < 1 { 492 res.lines = counter(tallies['\n']) 493 res.tabs = counter(tallies['\t']) 494 res.spaces = counter(tallies[' ']) 495 res.lf = counter(tallies['\n']) 496 res.nulls = counter(tallies[0]) 497 res.fulls = counter(tallies[255]) 498 for i := 128; i < len(tallies); i++ { 499 res.highs += counter(tallies[i]) 500 } 501 502 if err == io.EOF { 503 return res.handleEnd(width, prev1, prev2) 504 } 505 return err 506 } 507 508 chunk := buf[:n] 509 if res.bytes == 0 { 510 res.bom = checkBOM(chunk) 511 } 512 res.bytes += counter(n) 513 514 for _, b := range chunk { 515 // count values without branching, because it's fun 516 tallies[b]++ 517 518 if b != '\n' { 519 prev2 = prev1 520 prev1 = b 521 width++ 522 continue 523 } 524 525 // handle line-feeds 526 527 crlf := count(prev1, '\r') 528 res.crlf += crlf 529 530 // count lines with trailing spaces, whether these end with 531 // a CRLF byte-pair or just a line-feed byte 532 if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') { 533 res.trailing++ 534 } 535 536 // exclude any CR from the current line's width-count 537 width -= crlf 538 if res.maxWidth < width { 539 res.maxWidth = width 540 } 541 542 prev2 = prev1 543 prev1 = b 544 width = 0 545 } 546 } 547 } 548 549 // handleEnd fixes/finalizes stats when input data end; this func is only 550 // meant to be used by func updateStats, since it takes some of the latter's 551 // local variables 552 func (res *stats) handleEnd(width counter, prev1, prev2 byte) error { 553 if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') { 554 res.trailing++ 555 } 556 557 if res.maxWidth < width { 558 res.maxWidth = width 559 } 560 561 // avoid reporting 0 lines with a non-0 byte-count: this is unlike the 562 // standard cmd-line tool `wc` 563 if res.bytes > 0 && prev1 != '\n' { 564 res.lines++ 565 } 566 567 return nil 568 } 569 570 // count checks if 2 bytes are the same, returning either 0 or 1, which can 571 // be added directly/branchlessly to totals 572 func count(x, y byte) counter { 573 var c counter 574 if x == y { 575 c = 1 576 } else { 577 c = 0 578 } 579 return c 580 }