/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
Single-file source-code for coby: this version has no http(s) support. Even
the unit-tests from the original coby are omitted.

To compile a smaller-sized command-line app, you can use the `go` command as
follows:

    go build -ldflags "-s -w" -trimpath coby.go
*/

package main

import (
	"bufio"
	"errors"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"sync"
)

// info is the help message: func main skips its leading line-feed when
// showing it
const info = `
coby [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

- bytes
- lines
- how many lines have trailing spaces
- how many lines end with a CRLF pair
- all-off (0) bytes
- all-on (255) bytes
- high-bytes (128+)
- which (if any) byte-order mark the data start with

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them.
`

// maxEmptyReads limits how many consecutive reads giving no data and no
// error func updateUsing tolerates, before giving up on a reader
const maxEmptyReads = 100

// header is the first output line: its items are in the same order func
// showResult emits its columns
var header = []string{
	`name`,
	`bytes`,
	`lines`,
	`lf`,
	`crlf`,
	`spaces`,
	`tabs`,
	`trails`,
	`nulls`,
	`fulls`,
	`highs`,
	`bom`,
}

// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully
type event struct {
	// Index points to the task's entry in the results-slice
	Index int

	// Stats has all the byte-related stats
	Stats stats

	// Err is the completed task's error, or lack of
	Err error
}

// main shows the help message when asked to, otherwise it emits the header
// line, finds all files to check, and reports the stats for each of them;
// the app's exit code is 1 when any path can't be found/walked, or when
// any input fails
func main() {
	if len(os.Args) > 1 {
		switch os.Args[1] {
		case `-h`, `--h`, `-help`, `--help`:
			// skip the leading line-feed in the help message
			os.Stderr.WriteString(info[1:])
			return
		}
	}

	// show first/heading line right away, to let users know things are
	// happening
	for i, s := range header {
		if i > 0 {
			os.Stdout.WriteString("\t")
		}
		os.Stdout.WriteString(s)
	}
	// assume an error means later stages/apps in a pipe had enough input and
	// quit successfully, so quit successfully too
	_, err := os.Stdout.WriteString("\n")
	if err != nil {
		return
	}

	// names has all filepaths given, ignoring repetitions
	names, ok := findAllFiles(unique(os.Args[1:]))
	if !ok {
		os.Exit(1)
	}
	// a dash stands for the standard input, the default when no names
	// are given
	if len(names) == 0 {
		names = []string{`-`}
	}

	events := make(chan event)
	go handleInputs(names, events)
	if !handleOutput(os.Stdout, len(names), events) {
		os.Exit(1)
	}
}

// handleInputs launches all the tasks which do the actual work, limiting how
// many inputs are being worked on at the same time; each task sends exactly
// one event, and the events channel is closed after all tasks are over
func handleInputs(names []string, events chan event) {
	// allow output-reporter task to end, and thus the app
	defer close(events)

	// permissions limits how many worker tasks can be active at the same
	// time: when given many filepaths to work on, rate-limiting avoids
	// a massive number of concurrent tasks which read and process input
	permissions := make(chan struct{}, runtime.NumCPU())
	defer close(permissions)

	var inputs sync.WaitGroup
	for i := range names {
		// wait until some concurrency-room is available
		permissions <- struct{}{}
		inputs.Add(1)

		go func(i int) {
			defer inputs.Done()
			res, err := handleInput(names[i])
			events <- event{i, res, err}
			// give the concurrency-room back only after the result is sent
			<-permissions
		}(i)
	}

	// wait for all inputs, before closing the `events` channel
	inputs.Wait()
}

// handleInput handles each work-item for func handleInputs: a dash means
// reading from the standard input, while anything else is opened as a file;
// the stats returned always have their name and result-status set
func handleInput(path string) (stats, error) {
	var res stats
	res.name = path

	if path == `-` {
		err := res.updateStats(os.Stdin)
		return res, err
	}

	f, err := os.Open(path)
	if err != nil {
		res.result = resultError
		// on windows, file-not-found error messages may mention `CreateFile`,
		// even when trying to open files in read-only mode
		return res, errors.New(`can't open file named ` + path)
	}
	defer f.Close()

	err = res.updateStats(f)
	return res, err
}

// handleOutput asynchronously updates output as results are known, whether
// it's errors or successful results; returns whether it succeeded, which
// means no errors happened; failing to write results counts as success,
// since it's assumed to mean the output's receiver is done
func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
	ok = true
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	results := make([]stats, rescount)

	// keep track of which tasks are over, so that on each event all leading
	// results which are ready are shown: all of this ensures prompt output
	// updates as soon as results come in, while keeping the original order
	// of the names/filepaths given; this slice shares its items with the
	// results-slice, so updating the latter updates both
	resultsLeft := results

	for v := range events {
		results[v.Index] = v.Stats
		if v.Err != nil {
			ok = false
			// show all buffered results before the error message
			bw.Flush()
			showError(v.Err)

			// stay in the current loop, in case this failure was keeping
			// previous successes from showing up
		}

		n := countLeadingReady(resultsLeft)

		for _, res := range resultsLeft[:n] {
			if err := showResult(bw, res); err != nil {
				// assume later stages/apps in a pipe had enough input and
				// quit successfully, so quit successfully too
				return true
			}
		}
		resultsLeft = resultsLeft[n:]

		// flush output-buffer only if anything new was shown
		if n > 0 {
			bw.Flush()
		}
	}

	return ok
}

// showError standardizes how errors from this app look: messages are lines
// on the standard error, surrounded by ANSI codes to style them in red
func showError(err error) {
	os.Stderr.WriteString("\x1b[31m")
	os.Stderr.WriteString(err.Error())
	os.Stderr.WriteString("\x1b[0m\n")
}

// showResult emits a TSV line for the stats given, following the same
// column-order as the header line; results marked as errors emit nothing;
// only the last write is checked, since a bufio.Writer keeps reporting its
// first write-error on all later writes
func showResult(w *bufio.Writer, res stats) error {
	if res.result == resultError {
		return nil
	}

	// buf is reused to turn all numbers into their decimal digits
	var buf [64]byte
	w.WriteString(res.name)
	w.WriteByte('\t')
	w.Write(strconv.AppendUint(buf[:0], uint64(res.bytes), 10))
	w.WriteByte('\t')
	w.Write(strconv.AppendUint(buf[:0], uint64(res.lines), 10))
	w.WriteByte('\t')
	w.Write(strconv.AppendUint(buf[:0], uint64(res.lf), 10))
	w.WriteByte('\t')
	w.Write(strconv.AppendUint(buf[:0], uint64(res.crlf), 10))
	w.WriteByte('\t')
	w.Write(strconv.AppendUint(buf[:0], uint64(res.spaces), 10))
	w.WriteByte('\t')
	w.Write(strconv.AppendUint(buf[:0], uint64(res.tabs), 10))
	w.WriteByte('\t')
	w.Write(strconv.AppendUint(buf[:0], uint64(res.trailing), 10))
	w.WriteByte('\t')
	w.Write(strconv.AppendUint(buf[:0], uint64(res.nulls), 10))
	w.WriteByte('\t')
	w.Write(strconv.AppendUint(buf[:0], uint64(res.fulls), 10))
	w.WriteByte('\t')
	w.Write(strconv.AppendUint(buf[:0], uint64(res.highs), 10))
	w.WriteByte('\t')
	w.WriteString(bomLegend[res.bom])
	return w.WriteByte('\n')
}

// unique ensures items only appear once in the result, keeping the original
// slice unchanged; items keep the order of their first appearance
func unique(src []string) []string {
	var unique []string
	got := make(map[string]struct{})
	for _, s := range src {
		if _, ok := got[s]; ok {
			continue
		}
		unique = append(unique, s)
		got[s] = struct{}{}
	}
	return unique
}

// findAllFiles does what it says, given a mix of file/folder paths, finding
// all files recursively in the case of folders; the names found have no
// repetitions, and the boolean result is false when any path couldn't be
// found or fully walked, after showing an error message about it
func findAllFiles(paths []string) (found []string, ok bool) {
	var unique []string
	got := make(map[string]struct{})
	ok = true

	for _, root := range paths {
		// a dash means standard input
		if root == `-` {
			if _, ok := got[root]; ok {
				continue
			}

			unique = append(unique, root)
			got[root] = struct{}{}
			continue
		}

		_, err := os.Stat(root)
		if os.IsNotExist(err) {
			ok = false
			// on windows, file-not-found error messages may mention `CreateFile`,
			// even when trying to open files in read-only mode
			err := errors.New(`can't find file/folder named ` + root)
			showError(err)
			continue
		}

		// any other error from os.Stat is left for the walk below to report
		err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
			if err != nil {
				return err
			}

			// only files make it into the result
			if d.IsDir() {
				return nil
			}

			if _, ok := got[path]; ok {
				return nil
			}

			unique = append(unique, path)
			got[path] = struct{}{}
			return nil
		})

		if err != nil {
			ok = false
			showError(err)
		}
	}

	return unique, ok
}

// counter makes it easy to change the int-size of almost all counters
type counter int

// statResult constrains possible result-states/values in type stats
type statResult int

const (
	// resultPending is the default not-yet-ready result-status
	resultPending = statResult(0)

	// resultError means result should show as an error, instead of data
	resultError = statResult(1)

	// resultSuccess means result can be shown
	resultSuccess = statResult(2)
)

// bomType constrains which byte-order marks can be reported: its values
// are also valid indices for the bomLegend slice
type bomType int

const (
	noBOM      = bomType(0)
	utf8BOM    = bomType(1)
	utf16leBOM = bomType(2)
	utf16beBOM = bomType(3)
	utf32leBOM = bomType(4)
	utf32beBOM = bomType(5)
)

// bomLegend has the string-equivalents of the `bomType` constants
var bomLegend = []string{
	``,
	`UTF-8`,
	`UTF-16 LE`,
	`UTF-16 BE`,
	`UTF-32 LE`,
	`UTF-32 BE`,
}

// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read
	bytes int

	// lines counts lines, and is 0 only when the byte-count is also 0
	lines counter

	// maxWidth is maximum byte-width of lines, excluding carriage-returns
	// and/or line-feeds
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// the type of byte-order mark detected
	bom bomType

	// name is the filepath of the file/source these stats are about
	name string

	// result keeps track of whether results are valid and/or ready
	result statResult
}

// updateStats does what it says, reading everything from a reader; it also
// sets the result-status, depending on whether reading the data failed
func (res *stats) updateStats(r io.Reader) error {
	err := res.updateUsing(r)
	if err == io.EOF {
		err = nil
	}

	if err == nil {
		res.result = resultSuccess
	} else {
		res.result = resultError
	}
	return err
}

// checkBOM finds which byte-order mark, if any, the data given start with:
// only the first 4 bytes can matter; the UTF-32 checks come before the
// UTF-16 ones, since a UTF-32 LE mark starts with the UTF-16 LE one
func checkBOM(data []byte) bomType {
	d := data
	l := len(data)

	if l >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf {
		return utf8BOM
	}
	if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
		return utf32leBOM
	}
	if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
		return utf32beBOM
	}
	if l >= 2 && data[0] == 0xff && data[1] == 0xfe {
		return utf16leBOM
	}
	if l >= 2 && data[0] == 0xfe && data[1] == 0xff {
		return utf16beBOM
	}

	return noBOM
}

// updateUsing helps func updateStats do its job: it reads all data from the
// reader given, updating all stats along the way, and returns nil when the
// data end normally, or the reader's error otherwise; readers which keep
// giving neither data nor errors result in error io.ErrNoProgress
func (res *stats) updateUsing(r io.Reader) error {
	var buf [32 * 1024]byte
	var tallies [256]uint64

	var width counter
	var prev1, prev2 byte

	// head remembers the first few bytes read, so byte-order marks are
	// found even when the first reads give fewer bytes than marks need
	var head [4]byte
	headLen := 0

	// emptyReads counts consecutive reads giving no data and no error
	emptyReads := 0

	for {
		n, err := r.Read(buf[:])
		if n < 1 {
			// an empty read with no error doesn't mean the data are over,
			// so try again, unless that has kept happening
			if err == nil {
				emptyReads++
				if emptyReads < maxEmptyReads {
					continue
				}
				err = io.ErrNoProgress
			}

			res.lines = counter(tallies['\n'])
			res.tabs = counter(tallies['\t'])
			res.spaces = counter(tallies[' '])
			res.lf = counter(tallies['\n'])
			res.nulls = counter(tallies[0])
			res.fulls = counter(tallies[255])
			for i := 128; i < 256; i++ {
				res.highs += counter(tallies[i])
			}

			if err == io.EOF {
				return res.handleEnd(width, prev1, prev2)
			}
			return err
		}
		emptyReads = 0

		chunk := buf[:n]
		if headLen < len(head) {
			// keep checking until enough leading bytes are available: a
			// longer mark can start with a shorter one
			headLen += copy(head[headLen:], chunk)
			res.bom = checkBOM(head[:headLen])
		}
		res.bytes += n

		for _, b := range chunk {
			// count values without branching, because it's fun
			tallies[b]++

			// handle line-feeds
			if b == '\n' {
				crlf := count(prev1, '\r')
				res.crlf += crlf

				// count lines with trailing spaces, whether these end with
				// a CRLF byte-pair or just a line-feed byte
				if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
					res.trailing++
				}

				// exclude any CR from the current line's width-count
				width -= crlf
				if res.maxWidth < width {
					res.maxWidth = width
				}

				prev2 = prev1
				prev1 = b
				width = 0
				continue
			}

			prev2 = prev1
			prev1 = b
			width++
		}
	}
}

// handleEnd fixes/finalizes stats when input data end; this func is only
// meant to be used by func updateUsing, since it takes some of the latter's
// local variables: the width of the last line so far, and the last 2 bytes
// read, the most recent one being the first of the pair; it always returns
// nil
func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
	// handle trailing spaces in a last line which has no line-feed
	if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
		res.trailing++
	}

	if res.maxWidth < width {
		res.maxWidth = width
	}

	// avoid reporting 0 lines with a non-0 byte-count: this is unlike the
	// standard cmd-line tool `wc`
	if res.bytes > 0 && prev1 != '\n' {
		res.lines++
	}

	return nil
}

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly/branchlessly to totals
func count(x, y byte) counter {
	if x != y {
		return 0
	}
	return 1
}

// countLeadingReady finds how many items are ready to show at the start of a
// results-slice, which ensures output matches the original item-order; items
// are ready when their result-status is no longer pending
func countLeadingReady(values []stats) int {
	for i, v := range values {
		if v.result == resultPending {
			return i
		}
	}
	return len(values)
}