File: coby.go 1 /* 2 The MIT License (MIT) 3 4 Copyright © 2020-2025 pacman64 5 6 Permission is hereby granted, free of charge, to any person obtaining a copy of 7 this software and associated documentation files (the “Software”), to deal 8 in the Software without restriction, including without limitation the rights to 9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 of the Software, and to permit persons to whom the Software is furnished to do 11 so, subject to the following conditions: 12 13 The above copyright notice and this permission notice shall be included in all 14 copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 SOFTWARE. 23 */ 24 25 /* 26 Single-file source-code for coby: this version has no http(s) support. Even 27 the unit-tests from the original coby are omitted. 28 29 To compile a smaller-sized command-line app, you can use the `go` command as 30 follows: 31 32 go build -ldflags "-s -w" -trimpath coby.go 33 */ 34 35 package main 36 37 import ( 38 "bufio" 39 "errors" 40 "io" 41 "io/fs" 42 "os" 43 "path/filepath" 44 "runtime" 45 "strconv" 46 "sync" 47 ) 48 49 const info = ` 50 coby [files/folders...] 
// info is the help message: func main shows it without its leading line-feed
const info = `
coby [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - how many lines have trailing spaces
    - how many lines end with a CRLF pair
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data start with

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them.
`

// header is the first output line: its items follow the same order func
// showResult uses for its values
var header = []string{
	`name`,
	`bytes`,
	`runes`,
	`lines`,
	`lf`,
	`crlf`,
	`spaces`,
	`tabs`,
	`trails`,
	`nulls`,
	`fulls`,
	`highs`,
	`bom`,
}

// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully
type event struct {
	// Index points to the task's entry in the results-slice
	Index int

	// Stats has all the byte-related stats
	Stats stats

	// Err is the completed task's error, or lack of
	Err error
}

// main shows the help message when asked to, otherwise it emits the header
// line, finds all files to check, and reports their stats in the same order
// the names were given; the app fails with exit-code 1 when any name can't
// be found, or when any input fails
func main() {
	if len(os.Args) > 1 {
		switch os.Args[1] {
		case `-h`, `--h`, `-help`, `--help`:
			os.Stderr.WriteString(info[1:])
			return
		}
	}

	// show first/heading line right away, to let users know things are
	// happening
	for i, s := range header {
		if i > 0 {
			os.Stdout.WriteString("\t")
		}
		os.Stdout.WriteString(s)
	}
	// assume an error means later stages/apps in a pipe had enough input and
	// quit successfully, so quit successfully too
	if _, err := os.Stdout.WriteString("\n"); err != nil {
		return
	}

	// names has all filepaths given, ignoring repetitions
	names, ok := findAllFiles(unique(os.Args[1:]))
	if !ok {
		os.Exit(1)
	}
	if len(names) == 0 {
		names = []string{`-`}
	}

	events := make(chan event)
	go handleInputs(names, events)
	if !handleOutput(os.Stdout, len(names), events) {
		os.Exit(1)
	}
}

// handleInputs launches all the tasks which do the actual work, limiting how
// many inputs are being worked on at the same time; it closes the channel
// given only after all tasks are over
func handleInputs(names []string, events chan event) {
	// allow output-reporter task to end, and thus the app
	defer close(events)

	// permissions limits how many worker tasks can be active at the same
	// time: when given many filepaths to work on, rate-limiting avoids
	// a massive number of concurrent tasks which read and process input
	permissions := make(chan struct{}, runtime.NumCPU())
	defer close(permissions)

	var inputs sync.WaitGroup
	for i := range names {
		// wait until some concurrency-room is available
		permissions <- struct{}{}
		inputs.Add(1)

		go func(i int) {
			defer inputs.Done()
			res, err := handleInput(names[i])
			events <- event{Index: i, Stats: res, Err: err}
			// give room to the next task only after the result was taken
			<-permissions
		}(i)
	}

	// wait for all inputs, before closing the `events` channel
	inputs.Wait()
}

// handleInput handles each work-item for func handleInputs: a dash stands
// for the standard input, while any other name is opened as a file; the
// stats returned are marked as an error-result when things fail
func handleInput(path string) (stats, error) {
	var res stats
	res.name = path

	if path == `-` {
		err := res.updateStats(os.Stdin)
		return res, err
	}

	f, err := os.Open(path)
	if err != nil {
		res.result = resultError
		// on windows, file-not-found error messages may mention `CreateFile`,
		// even when trying to open files in read-only mode
		return res, errors.New(`can't open file named ` + path)
	}
	defer f.Close()

	err = res.updateStats(f)
	return res, err
}

// handleOutput asynchronously updates output as results are known, whether
// it's errors or successful results; returns whether it succeeded, which
// means no errors happened; failing to write output isn't considered an
// error, and makes this func quit right away
func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
	ok = true
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	results := make([]stats, rescount)

	// keep track of which tasks are over, so that on each event all leading
	// results which are ready are shown: all of this ensures prompt output
	// updates as soon as results come in, while keeping the original order
	// of the names/filepaths given
	resultsLeft := results

	for v := range events {
		results[v.Index] = v.Stats
		if v.Err != nil {
			ok = false
			// show all pending output before the error message
			bw.Flush()
			showError(v.Err)

			// stay in the current loop, in case this failure was keeping
			// previous successes from showing up
		}

		n := countLeadingReady(resultsLeft)

		for _, res := range resultsLeft[:n] {
			if err := showResult(bw, res); err != nil {
				// assume later stages/apps in a pipe had enough input and
				// quit successfully, so quit successfully too
				return true
			}
		}
		resultsLeft = resultsLeft[n:]

		// flush output-buffer only if anything new was shown
		if n > 0 {
			bw.Flush()
		}
	}

	return ok
}

// showError standardizes how errors from this app look: each message is its
// own line on the standard error, wrapped in ANSI codes for red text
func showError(err error) {
	os.Stderr.WriteString("\x1b[31m")
	os.Stderr.WriteString(err.Error())
	os.Stderr.WriteString("\x1b[0m\n")
}

// showResult emits a TSV line with the stats given, following the order of
// the items in `header`; error-results emit nothing. The error returned is
// the one from the last write: errors from a bufio.Writer are sticky, so
// any earlier failure also shows up there.
func showResult(w *bufio.Writer, res stats) error {
	if res.result == resultError {
		return nil
	}

	numbers := [...]uint64{
		uint64(res.bytes),
		uint64(res.runes),
		uint64(res.lines),
		uint64(res.lf),
		uint64(res.crlf),
		uint64(res.spaces),
		uint64(res.tabs),
		uint64(res.trailing),
		uint64(res.nulls),
		uint64(res.fulls),
		uint64(res.highs),
	}

	var buf [64]byte
	w.WriteString(res.name)
	for _, n := range numbers {
		w.WriteByte('\t')
		w.Write(strconv.AppendUint(buf[:0], n, 10))
	}
	w.WriteByte('\t')
	w.WriteString(bomLegend[res.bom])
	return w.WriteByte('\n')
}

// unique ensures items only appear once in the result, keeping the original
// slice unchanged, as well as the order of first appearances
func unique(src []string) []string {
	var res []string
	got := make(map[string]struct{}, len(src))
	for _, s := range src {
		if _, ok := got[s]; ok {
			continue
		}
		res = append(res, s)
		got[s] = struct{}{}
	}
	return res
}

// findAllFiles does what it says, given a mix of file/folder paths, finding
// all files recursively in the case of folders; dashes stand for the standard
// input and are kept as given. Names are never repeated in the result, and
// all problems are shown right away, besides making the result not ok.
func findAllFiles(paths []string) (found []string, ok bool) {
	got := make(map[string]struct{})
	ok = true

	// add remembers the path given, unless it was already found
	add := func(path string) {
		if _, dup := got[path]; dup {
			return
		}
		got[path] = struct{}{}
		found = append(found, path)
	}

	for _, root := range paths {
		// a dash means standard input
		if root == `-` {
			add(root)
			continue
		}

		if _, err := os.Stat(root); os.IsNotExist(err) {
			ok = false
			// on windows, file-not-found error messages may mention `CreateFile`,
			// even when trying to open files in read-only mode
			showError(errors.New(`can't find file/folder named ` + root))
			continue
		}

		err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
			if err != nil {
				return err
			}
			if !d.IsDir() {
				add(path)
			}
			return nil
		})

		if err != nil {
			ok = false
			showError(err)
		}
	}

	return found, ok
}

// counter makes it easy to change the int-size of almost all counters
type counter int

// statResult constrains possible result-states/values in type stats
type statResult int

const (
	// resultPending is the default not-yet-ready result-status
	resultPending = statResult(0)

	// resultError means result should show as an error, instead of data
	resultError = statResult(1)

	// resultSuccess means result can be shown
	resultSuccess = statResult(2)
)

// bomType identifies which byte-order mark, if any, some data start with
type bomType int

const (
	noBOM      = bomType(0)
	utf8BOM    = bomType(1)
	utf16leBOM = bomType(2)
	utf16beBOM = bomType(3)
	utf32leBOM = bomType(4)
	utf32beBOM = bomType(5)
)

// bomLegend has the string-equivalents of the `bomType` constants
var bomLegend = []string{
	``,
	`UTF-8`,
	`UTF-16 LE`,
	`UTF-16 BE`,
	`UTF-32 LE`,
	`UTF-32 BE`,
}

// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read
	bytes int

	// lines counts lines, and is 0 only when the byte-count is also 0
	lines counter

	// runes counts utf-8 sequences, each of which can use up to 4 bytes and
	// is usually a complete symbol: `emoji` country-flags are commonly-used
	// counter-examples, as these `symbols` need 2 runes of 4 bytes each,
	// which means 8 bytes in total
	runes counter

	// maxWidth is maximum byte-width of lines, excluding carriage-returns
	// and/or line-feeds
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// the type of byte-order mark detected
	bom bomType

	// name is the filepath of the file/source these stats are about
	name string

	// results keeps track of whether results are valid and/or ready
	result statResult
}

// updateStats does what it says, reading everything from a reader; it also
// marks the result as either a success or an error, so it's ready to show
func (res *stats) updateStats(r io.Reader) error {
	err := res.updateUsing(r)
	if err == io.EOF {
		err = nil
	}

	if err == nil {
		res.result = resultSuccess
	} else {
		res.result = resultError
	}
	return err
}

// checkBOM finds which byte-order mark, if any, the bytes given start with:
// only the first 4 bytes can matter. The UTF-32 checks come before the
// UTF-16 ones, since the UTF-32 LE mark starts with the UTF-16 LE one.
func checkBOM(data []byte) bomType {
	d := data
	l := len(data)

	if l >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf {
		return utf8BOM
	}
	if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
		return utf32leBOM
	}
	if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
		return utf32beBOM
	}
	if l >= 2 && d[0] == 0xff && d[1] == 0xfe {
		return utf16leBOM
	}
	if l >= 2 && d[0] == 0xfe && d[1] == 0xff {
		return utf16beBOM
	}

	return noBOM
}

// updateUsing helps func updateStats do its job: it reads until the reader
// given either ends or fails, returning nil in the first case, and the
// reader's error in the second one. Reads giving no data and no error are
// ignored, unless too many happen in a row, which results in the error
// io.ErrNoProgress.
func (res *stats) updateUsing(r io.Reader) error {
	// maxEmptyReads limits how many consecutive reads can give nothing
	// without an error, to avoid looping forever on misbehaving readers
	const maxEmptyReads = 100

	var buf [32 * 1024]byte
	var tallies [256]uint64

	// head remembers the first few bytes across reads, since byte-order
	// marks can be split across short reads, as can happen with pipes
	var head [4]byte
	headLen := 0
	emptyReads := 0

	var width counter
	var prev1, prev2 byte

	for {
		n, err := r.Read(buf[:])

		if n > 0 {
			emptyReads = 0
			chunk := buf[:n]

			// keep checking for byte-order marks until enough leading bytes
			// are available: the extra check ensures previously-used stats
			// keep their byte-order mark
			if headLen < len(head) && res.bytes == headLen {
				headLen += copy(head[headLen:], chunk)
				res.bom = checkBOM(head[:headLen])
			}
			res.bytes += n

			for _, b := range chunk {
				// count values without branching, because it's fun
				tallies[b]++

				// handle non-ASCII runes, assuming input is valid UTF-8
				res.runes += 1 - count(b&0xc0, 0x80)

				// handle line-feeds
				if b == '\n' {
					crlf := count(prev1, '\r')
					res.crlf += crlf

					// count lines with trailing spaces, whether these end with
					// a CRLF byte-pair or just a line-feed byte
					if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
						res.trailing++
					}

					// exclude any CR from the current line's width-count
					width -= crlf
					if res.maxWidth < width {
						res.maxWidth = width
					}

					prev2 = prev1
					prev1 = b
					width = 0
					continue
				}

				prev2 = prev1
				prev1 = b
				width++
			}
		}

		if err != nil {
			res.useTallies(&tallies)
			if err == io.EOF {
				return res.handleEnd(width, prev1, prev2)
			}
			return err
		}

		// a read giving no data and no error doesn't mean the input is over
		if n < 1 {
			emptyReads++
			if emptyReads >= maxEmptyReads {
				res.useTallies(&tallies)
				return io.ErrNoProgress
			}
		}
	}
}

// useTallies updates all the stats which come from the byte-value tallies
// func updateUsing keeps while reading; it's only meant to be called once,
// when the reading is over
func (res *stats) useTallies(tallies *[256]uint64) {
	res.lines = counter(tallies['\n'])
	res.tabs = counter(tallies['\t'])
	res.spaces = counter(tallies[' '])
	res.lf = counter(tallies['\n'])
	res.nulls = counter(tallies[0])
	res.fulls = counter(tallies[255])
	for i := 128; i < 256; i++ {
		res.highs += counter(tallies[i])
	}
}

// handleEnd fixes/finalizes stats when input data end; this func is only
// meant to be used by func updateUsing, since it takes some of the latter's
// local variables: these are the byte-width of the last line so far, and
// the last 2 bytes read, the first of these being the latest one
func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
	// handle trailing spaces on a last line which has no line-feed
	if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
		res.trailing++
	}

	if res.maxWidth < width {
		res.maxWidth = width
	}

	// avoid reporting 0 lines with a non-0 byte-count: this is unlike the
	// standard cmd-line tool `wc`
	if res.bytes > 0 && prev1 != '\n' {
		res.lines++
	}

	return nil
}

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly/branchlessly to totals
func count(x, y byte) counter {
	if x != y {
		return 0
	}
	return 1
}

// countLeadingReady finds how many items are ready to show at the start of a
// results-slice, which ensures output matches the original item-order; both
// successes and errors count as ready
func countLeadingReady(values []stats) int {
	for i, v := range values {
		if v.result == resultPending {
			return i
		}
	}
	return len(values)
}