/*
The MIT License (MIT)

Copyright (c) 2026 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
To compile a smaller-sized command-line app, you can use the `go` command as
follows:

    go build -ldflags "-s -w" -trimpath coby.go
*/

package main

import (
	"bufio"
	"errors"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"sync"
)

// info is the help message; its leading line-feed is skipped when shown
const info = `
coby [options...] [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - how many lines have trailing spaces (trails)
    - how many lines end with a CRLF pair
    - all-bits-off (null) bytes
    - all-bits-on (full) bytes
    - top-bit-on (high) bytes
    - which unicode byte-order-mark (bom) sequence the data start with

Some of these stats (lines, CRLFs, BOMs) only make sense for plain-text
data, and thus may not be meaningful for general binary data.

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them. A mix
of files/folders is supported for convenience.

The only option available is to show this help message, using any of
"-h", "--h", "-help", or "--help", without the quotes.
`

// maxEmptyReads limits how many consecutive reads returning no data and no
// error are tolerated, before giving up on a reader with io.ErrNoProgress
const maxEmptyReads = 100

// header has all the values for the first output line
var header = []string{
	`name`,
	`bytes`,
	`lines`,
	`lf`,
	`crlf`,
	`spaces`,
	`tabs`,
	`trails`,
	`nulls`,
	`fulls`,
	`highs`,
	`bom`,
}

// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully
type event struct {
	// Index points to the task's entry in the results-slice
	Index int

	// Stats has all the byte-related stats
	Stats stats

	// Err is the completed task's error, or lack of
	Err error
}

// main shows the help message when asked to, otherwise it emits the header
// line, finds all the files to check, and reports their stats in order
func main() {
	args := os.Args[1:]

	if len(args) > 0 {
		switch args[0] {
		case `-h`, `--h`, `-help`, `--help`:
			os.Stdout.WriteString(info[1:])
			return

		case `--`:
			args = args[1:]
		}
	}

	// show first/heading line right away, to let users know things are
	// happening
	for i, s := range header {
		if i > 0 {
			os.Stdout.WriteString("\t")
		}
		os.Stdout.WriteString(s)
	}
	// assume an error means later stages/apps in a pipe had enough input and
	// quit successfully, so quit successfully too
	if _, err := os.Stdout.WriteString("\n"); err != nil {
		return
	}

	// names has all filepaths given, ignoring repetitions
	names, ok := findAllFiles(args)
	if !ok {
		os.Exit(1)
	}

	// standard input is the default only when no paths were given at all:
	// folders which turn out to have no files in them simply give no lines
	if len(args) == 0 {
		names = []string{`-`}
	}

	events := make(chan event)
	go handleInputs(names, events)
	if !handleOutput(os.Stdout, len(names), events) {
		os.Exit(1)
	}
}

// handleInputs launches all the tasks which do the actual work, limiting how
// many inputs are being worked on at the same time; the events channel is
// closed once all tasks are over
func handleInputs(names []string, events chan<- event) {
	defer close(events) // allow the output-reporter task to end

	var tasks sync.WaitGroup
	// the number of tasks is always known in advance
	tasks.Add(len(names))

	// permissions is buffered to limit concurrency to the core-count
	permissions := make(chan struct{}, runtime.NumCPU())
	defer close(permissions)

	for i, name := range names {
		// wait until some concurrency-room is available, before proceeding
		permissions <- struct{}{}

		go func(i int, name string) {
			defer tasks.Done()

			res, err := handleInput(name)
			// free the concurrency-slot before the possibly-blocking send
			<-permissions
			events <- event{Index: i, Stats: res, Err: err}
		}(i, name)
	}

	// wait for all inputs, before closing the `events` channel, which in turn
	// would quit the whole app right away
	tasks.Wait()
}

// handleInput handles each work-item for func handleInputs: a dash stands
// for the standard input, while any other name is opened as a file
func handleInput(path string) (stats, error) {
	var res stats
	res.name = path

	if path == `-` {
		err := res.updateStats(os.Stdin)
		return res, err
	}

	f, err := os.Open(path)
	if err != nil {
		res.result = resultError
		// on windows, file-not-found error messages may mention `CreateFile`,
		// even when trying to open files in read-only mode
		return res, errors.New(`can't open file named ` + path)
	}
	defer f.Close()

	err = res.updateStats(f)
	return res, err
}

// handleOutput asynchronously updates output as results are known, whether
// it's errors or successful results; returns whether it succeeded, which
// means no errors happened
func handleOutput(w io.Writer, inputs int, events <-chan event) (ok bool) {
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	ok = true
	results := make([]stats, inputs)

	// keep track of which tasks are over, so that on each event all leading
	// results which are ready are shown: all of this ensures prompt output
	// updates as soon as results come in, while keeping the original order
	// of the names/filepaths given; both slices share the same items
	resultsLeft := results

	for v := range events {
		results[v.Index] = v.Stats
		if v.Err != nil {
			ok = false
			// flush first, so errors don't show before earlier results
			bw.Flush()
			showError(v.Err)

			// stay in the current loop, in case this failure was keeping
			// previous successes from showing up
		}

		for len(resultsLeft) > 0 {
			if resultsLeft[0].result == resultPending {
				break
			}

			if err := showResult(bw, resultsLeft[0]); err != nil {
				// assume later stages/apps in a pipe had enough input
				return ok
			}
			resultsLeft = resultsLeft[1:]
		}

		// show leading results immediately, if any
		bw.Flush()
	}

	return ok
}

// showError emits the error message given as its own line on stderr
func showError(err error) {
	os.Stderr.WriteString(err.Error())
	os.Stderr.WriteString("\n")
}

// showResult shows a TSV line for results marked as successful, doing nothing
// when given other types of results
func showResult(w *bufio.Writer, s stats) error {
	if s.result != resultSuccess {
		return nil
	}

	// columns has all numeric values, in the same order as the header
	columns := [...]counter{
		s.bytes, s.lines, s.lf, s.crlf, s.spaces,
		s.tabs, s.trailing, s.nulls, s.fulls, s.highs,
	}

	// buf has room for a tab and the 20 digits of the biggest uint64 value
	var buf [24]byte

	// errors from a bufio.Writer are sticky, so checking the last write is
	// enough to detect any failure among the previous ones
	w.WriteString(s.name)
	for _, c := range columns {
		w.Write(strconv.AppendUint(append(buf[:0], '\t'), uint64(c), 10))
	}
	w.WriteByte('\t')
	w.WriteString(bomLegend[s.bom])
	return w.WriteByte('\n')
}

// findAllFiles can be given a mix of file/folder paths, finding all files
// recursively in folders, avoiding duplicates; dashes stand for the standard
// input and are kept as given, as are names of files given explicitly, while
// files found in folders are given as absolute paths; all errors are shown
// on stderr as they're found, making the success value returned false
func findAllFiles(paths []string) (files []string, success bool) {
	// seen is keyed by absolute paths, so that different ways to name the
	// same file/folder count as repetitions; a dash is kept as its own key
	seen := make(map[string]struct{})
	success = true

	for _, path := range paths {
		// a dash means standard input
		if path == `-` {
			if _, ok := seen[path]; !ok {
				seen[path] = struct{}{}
				files = append(files, path)
			}
			continue
		}

		key := path
		if abs, err := filepath.Abs(path); err == nil {
			key = abs
		}
		if _, ok := seen[key]; ok {
			continue
		}

		info, err := os.Stat(path)
		if errors.Is(err, fs.ErrNotExist) {
			// on windows, file-not-found messages may mention `CreateFile`,
			// even when trying to open files in read-only mode
			err = errors.New(`can't find file/folder named ` + path)
		}

		if err != nil {
			// remember failures, to avoid repeating the same complaint
			seen[key] = struct{}{}
			showError(err)
			success = false
			continue
		}

		if !info.IsDir() {
			seen[key] = struct{}{}
			files = append(files, path)
			continue
		}

		// folders aren't marked as seen here, since the walk does that on
		// its first callback, which is for the top-level folder itself:
		// marking it in advance would make the walk skip the whole folder
		err = filepath.WalkDir(path, func(p string, d fs.DirEntry, err error) error {
			// check the walk's own error first: the entry can be nil when
			// that's set, and a folder which can't be read is given twice,
			// the second time along with the error
			if err != nil {
				showError(err)
				success = false
				// keep looking for files anywhere else
				return nil
			}

			abs, err := filepath.Abs(p)
			if err != nil {
				// the walk stops, and its caller shows this error
				return err
			}

			if _, ok := seen[abs]; ok {
				if d.IsDir() {
					return fs.SkipDir
				}
				return nil
			}
			seen[abs] = struct{}{}

			if !d.IsDir() {
				files = append(files, abs)
			}
			return nil
		})

		if err != nil {
			showError(err)
			success = false
		}
	}

	return files, success
}

// counter makes it easy to change the int-size of almost all counters
type counter uint64

// statResult constrains possible result-states/values in type stats
type statResult int

const (
	// resultPending is the default not-yet-ready result-status
	resultPending = statResult(0)

	// resultError means result should show as an error, instead of data
	resultError = statResult(1)

	// resultSuccess means a result's stats are ready to show
	resultSuccess = statResult(2)
)

// bomType is the type for the byte-order-mark enumeration
type bomType int

const (
	noBOM      = bomType(0)
	utf8BOM    = bomType(1)
	utf16leBOM = bomType(2)
	utf16beBOM = bomType(3)
	utf32leBOM = bomType(4)
	utf32beBOM = bomType(5)
)

// bomLegend has the string-equivalents of the bomType constants
var bomLegend = []string{
	``,
	`UTF-8`,
	`UTF-16 LE`,
	`UTF-16 BE`,
	`UTF-32 LE`,
	`UTF-32 BE`,
}

// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read
	bytes counter

	// lines counts lines, and is 0 only when the byte-count is also 0
	lines counter

	// maxWidth is maximum byte-width of lines, excluding carriage-returns
	// and/or line-feeds
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// the type of byte-order mark detected
	bom bomType

	// name is the filepath of the file/source these stats are about
	name string

	// result keeps track of whether results are valid and/or ready
	result statResult
}

// updateStats does what it says, reading everything from a reader; it also
// marks the result as either a success or an error, and returns the error
// which stopped the reading, if any: reaching the end of input isn't one
func (res *stats) updateStats(r io.Reader) error {
	err := res.updateUsing(r)
	if err == io.EOF {
		err = nil
	}

	if err == nil {
		res.result = resultSuccess
	} else {
		res.result = resultError
	}
	return err
}

// checkBOM finds which byte-order mark the bytes given start with, if any;
// the UTF-32 LE check must come before the UTF-16 LE one, since the latter's
// mark is a prefix of the former's
func checkBOM(data []byte) bomType {
	d := data
	l := len(data)

	if l >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf {
		return utf8BOM
	}
	if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
		return utf32leBOM
	}
	if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
		return utf32beBOM
	}
	if l >= 2 && d[0] == 0xff && d[1] == 0xfe {
		return utf16leBOM
	}
	if l >= 2 && d[0] == 0xfe && d[1] == 0xff {
		return utf16beBOM
	}

	return noBOM
}

// updateUsing helps func updateStats do its job: it reads until the reader
// fails or ends, returning the error which ended the loop, which is nil when
// the input ended normally; reads giving neither data nor an error are
// retried, up to maxEmptyReads times in a row, before giving up with
// io.ErrNoProgress
func (res *stats) updateUsing(r io.Reader) error {
	var buf [32 * 1024]byte
	var tallies [256]uint64

	// head remembers the first few bytes, so byte-order marks are detected
	// even when the first reads are shorter than the marks themselves
	var head [4]byte
	headLen := 0

	var width counter
	var prev1, prev2 byte
	emptyReads := 0

	for {
		n, err := r.Read(buf[:])

		// handle any data first, as readers can give both data and an error
		if n > 0 {
			emptyReads = 0
			chunk := buf[:n]

			if headLen < len(head) {
				headLen += copy(head[headLen:], chunk)
			}
			res.bytes += counter(n)

			for _, b := range chunk {
				// count values without branching, because it's fun
				tallies[b]++

				if b != '\n' {
					prev2 = prev1
					prev1 = b
					width++
					continue
				}

				// handle line-feeds

				crlf := count(prev1, '\r')
				res.crlf += crlf

				// count lines with trailing spaces, whether these end with
				// a CRLF byte-pair or just a line-feed byte
				if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
					res.trailing++
				}

				// exclude any CR from the current line's width-count
				width -= crlf
				if res.maxWidth < width {
					res.maxWidth = width
				}

				prev2 = prev1
				prev1 = b
				width = 0
			}
		}

		if err == nil {
			if n > 0 {
				continue
			}

			// an empty read without an error means nothing happened, not
			// that the input is over: try again, but not forever
			emptyReads++
			if emptyReads < maxEmptyReads {
				continue
			}
			err = io.ErrNoProgress
		}

		// input is over, one way or another: finalize all tally-based stats
		res.bom = checkBOM(head[:headLen])
		res.lines = counter(tallies['\n'])
		res.tabs = counter(tallies['\t'])
		res.spaces = counter(tallies[' '])
		res.lf = counter(tallies['\n'])
		res.nulls = counter(tallies[0])
		res.fulls = counter(tallies[255])
		for i := 128; i < len(tallies); i++ {
			res.highs += counter(tallies[i])
		}

		if err == io.EOF {
			return res.handleEnd(width, prev1, prev2)
		}
		return err
	}
}

// handleEnd fixes/finalizes stats when input data end; this func is only
// meant to be used by func updateUsing, since it takes some of the latter's
// local variables; it always returns nil
func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
	// the last line may lack a line-feed, and may still end with a space
	if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
		res.trailing++
	}

	if res.maxWidth < width {
		res.maxWidth = width
	}

	// avoid reporting 0 lines with a non-0 byte-count: this is unlike the
	// standard cmd-line tool `wc`
	if res.bytes > 0 && prev1 != '\n' {
		res.lines++
	}

	return nil
}

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly/branchlessly to totals
func count(x, y byte) counter {
	if x == y {
		return 1
	}
	return 0
}