/*
The MIT License (MIT)

Copyright © 2024 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
Single-file source-code for coby: this version has no http(s) support. Even
the unit-tests from the original coby are omitted.

To compile a smaller-sized command-line app, you can use the `go` command as
follows:

    go build -ldflags "-s -w" -trimpath coby.go
*/

package main

import (
	"bufio"
	"errors"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"sync"
)

// info is the help message: func main skips its leading line-feed when
// showing it
const info = `
coby [files/folders...]


COunt BYtes finds out some simple byte-related stats, counting

    - bytes
    - lines
    - how many lines have trailing spaces
    - how many lines end with a CRLF pair
    - all-off (0) bytes
    - all-on (255) bytes
    - high-bytes (128+)
    - which (if any) byte-order mark the data start with

The output is TSV (tab-separated values) lines, where the first line has
all the column names.

When no filepaths are given, the standard input is used by default. All
folder names given expand recursively into all filenames in them.
`

// header is the first output line: its items must stay in the same order
// func showResult emits its columns
var header = []string{
	`name`,
	`bytes`,
	`runes`,
	`lines`,
	`lf`,
	`crlf`,
	`spaces`,
	`tabs`,
	`trails`,
	`nulls`,
	`fulls`,
	`highs`,
	`bom`,
}

// event has what the output-reporting task needs to show the results of a
// task which has just completed, perhaps unsuccessfully
type event struct {
	// Index points to the task's entry in the results-slice
	Index int

	// Stats has all the byte-related stats
	Stats stats

	// Err is the completed task's error, or lack of
	Err error
}

func main() {
	if len(os.Args) > 1 {
		switch os.Args[1] {
		case `-h`, `--h`, `-help`, `--help`:
			os.Stderr.WriteString(info[1:])
			return
		}
	}

	// show first/heading line right away, to let users know things are
	// happening
	for i, s := range header {
		if i > 0 {
			os.Stdout.WriteString("\t")
		}
		os.Stdout.WriteString(s)
	}
	// assume an error means later stages/apps in a pipe had enough input and
	// quit successfully, so quit successfully too
	_, err := os.Stdout.WriteString("\n")
	if err != nil {
		return
	}

	// names has all filepaths given, ignoring repetitions
	names, ok := findAllFiles(unique(os.Args[1:]))
	if !ok {
		os.Exit(1)
	}
	if len(names) == 0 {
		names = []string{`-`}
	}

	events := make(chan event)
	go handleInputs(names, events)
	if !handleOutput(os.Stdout, len(names), events) {
		os.Exit(1)
	}
}

// handleInputs launches all the tasks which do the actual work, limiting how
// many inputs are being worked on at the same time; each task sends exactly
// one event, and the events channel is closed after the last task is over
func handleInputs(names []string, events chan event) {
	// allow output-reporter task to end, and thus the app
	defer close(events)

	// permissions limits how many worker tasks can be active at the same
	// time: when given many filepaths to work on, rate-limiting avoids
	// a massive number of concurrent tasks which read and process input
	permissions := make(chan struct{}, runtime.NumCPU())
	defer close(permissions)

	var inputs sync.WaitGroup
	for i := range names {
		// wait until some concurrency-room is available
		permissions <- struct{}{}
		inputs.Add(1)

		go func(i int) {
			defer inputs.Done()
			res, err := handleInput(names[i])
			events <- event{i, res, err}
			// give the concurrency-room back only after the result was
			// handed over to the output-reporter task
			<-permissions
		}(i)
	}

	// wait for all inputs, before closing the `events` channel
	inputs.Wait()
}

// handleInput handles each work-item for func handleInputs: a single dash
// stands for the standard input, anything else is a filepath to open
func handleInput(path string) (stats, error) {
	var res stats
	res.name = path

	if path == `-` {
		err := res.updateStats(os.Stdin)
		return res, err
	}

	f, err := os.Open(path)
	if err != nil {
		res.result = resultError
		// on windows, file-not-found error messages may mention `CreateFile`,
		// even when trying to open files in read-only mode
		return res, errors.New(`can't open file named ` + path)
	}
	defer f.Close()

	err = res.updateStats(f)
	return res, err
}

// handleOutput asynchronously updates output as results are known, whether
// it's errors or successful results; returns whether it succeeded, which
// means no errors happened
func handleOutput(w io.Writer, rescount int, events chan event) (ok bool) {
	ok = true
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	results := make([]stats, rescount)

	// keep track of which tasks are over, so that on each event all leading
	// results which are ready are shown: all of this ensures prompt output
	// updates as soon as results come in, while keeping the original order
	// of the names/filepaths given
	resultsLeft := results

	for v := range events {
		results[v.Index] = v.Stats
		if v.Err != nil {
			ok = false
			// flush first, so lines already buffered for the main output
			// are written before the error message is
			bw.Flush()
			showError(v.Err)

			// stay in the current loop, in case this failure was keeping
			// previous successes from showing up
		}

		n := countLeadingReady(resultsLeft)

		for _, res := range resultsLeft[:n] {
			if err := showResult(bw, res); err != nil {
				// assume later stages/apps in a pipe had enough input and
				// quit successfully, so quit successfully too
				return true
			}
		}
		resultsLeft = resultsLeft[n:]

		// flush output-buffer only if anything new was shown
		if n > 0 {
			bw.Flush()
		}
	}

	return ok
}

// showError standardizes how errors from this app look: messages go to the
// standard error, wrapped in the ANSI codes for red text
func showError(err error) {
	os.Stderr.WriteString("\x1b[31m")
	os.Stderr.WriteString(err.Error())
	os.Stderr.WriteString("\x1b[0m\n")
}

// showResult emits a TSV line for the result given, with its columns in the
// same order as the items in var header; results marked as errors emit
// nothing; the error returned is from the buffered writer
func showResult(w *bufio.Writer, res stats) error {
	if res.result == resultError {
		return nil
	}

	numbers := [...]uint64{
		uint64(res.bytes),
		uint64(res.runes),
		uint64(res.lines),
		uint64(res.lf),
		uint64(res.crlf),
		uint64(res.spaces),
		uint64(res.tabs),
		uint64(res.trailing),
		uint64(res.nulls),
		uint64(res.fulls),
		uint64(res.highs),
	}

	// errors from a bufio.Writer are sticky, so checking only the last
	// write is enough to detect any previous failure
	var buf [64]byte
	w.WriteString(res.name)
	for _, n := range numbers {
		w.WriteByte('\t')
		w.Write(strconv.AppendUint(buf[:0], n, 10))
	}
	w.WriteByte('\t')
	w.WriteString(bomLegend[res.bom])
	return w.WriteByte('\n')
}

// unique ensures items only appear once in the result, keeping the original
// slice unchanged, and keeping items in their order of first appearance
func unique(src []string) []string {
	var res []string
	got := make(map[string]struct{}, len(src))
	for _, s := range src {
		if _, dup := got[s]; dup {
			continue
		}
		res = append(res, s)
		got[s] = struct{}{}
	}
	return res
}

// findAllFiles does what it says, given a mix of file/folder paths, finding
// all files recursively in the case of folders; a dash (standard input) is
// kept as given; the result has no repeated paths, and ok is false when any
// path can't be found or any folder can't be fully explored, in which case
// errors are already shown
func findAllFiles(paths []string) (found []string, ok bool) {
	got := make(map[string]struct{})
	ok = true

	for _, root := range paths {
		// a dash means standard input
		if root == `-` {
			if _, dup := got[root]; dup {
				continue
			}

			found = append(found, root)
			got[root] = struct{}{}
			continue
		}

		_, err := os.Stat(root)
		if os.IsNotExist(err) {
			ok = false
			// on windows, file-not-found error messages may mention `CreateFile`,
			// even when trying to open files in read-only mode
			err := errors.New(`can't find file/folder named ` + root)
			showError(err)
			continue
		}

		err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
			if err != nil {
				return err
			}

			if d.IsDir() {
				return nil
			}

			if _, dup := got[path]; dup {
				return nil
			}

			found = append(found, path)
			got[path] = struct{}{}
			return nil
		})

		if err != nil {
			ok = false
			showError(err)
		}
	}

	return found, ok
}

// isZero enables branchless-counting, when xor-compared bytes are used
// as indices for it
var isZero = [256]byte{1}

// counter makes it easy to change the int-size of almost all counters
type counter int

// statResult constrains possible result-states/values in type stats
type statResult int

const (
	// resultPending is the default not-yet-ready result-status
	resultPending = statResult(0)

	// resultError signals result should show as an error, instead of data
	resultError = statResult(1)

	// resultSuccess means result can be shown
	resultSuccess = statResult(2)
)

// bomType identifies byte-order marks, and is usable as an index for
// var bomLegend
type bomType int

const (
	noBOM      = bomType(0)
	utf8BOM    = bomType(1)
	utf16leBOM = bomType(2)
	utf16beBOM = bomType(3)
	utf32leBOM = bomType(4)
	utf32beBOM = bomType(5)
)

// bomLegend has the text shown in the output for each bomType value
var bomLegend = []string{
	``,
	`UTF-8`,
	`UTF-16 LE`,
	`UTF-16 BE`,
	`UTF-32 LE`,
	`UTF-32 BE`,
}

// stats has all the size-stats for some input, as well as a way to
// skip showing results, in case of an error such as `file not found`
type stats struct {
	// bytes counts all bytes read
	bytes int

	// lines counts lines, and is 0 only when the byte-count is also 0
	lines counter

	// runes counts utf-8 sequences, each of which can use up to 4 bytes and
	// is usually a complete symbol: `emoji` country-flags are commonly-used
	// counter-examples, as these `symbols` need 2 runes, using 8 bytes each
	runes counter

	// maxWidth is maximum byte-width of lines, excluding carriage-returns
	// and/or line-feeds
	maxWidth counter

	// nulls counts all-bits-off bytes
	nulls counter

	// fulls counts all-bits-on bytes
	fulls counter

	// highs counts bytes with their `top` (highest-order) bit on
	highs counter

	// spaces counts ASCII spaces
	spaces counter

	// tabs counts ASCII tabs
	tabs counter

	// trailing counts lines with trailing spaces in them
	trailing counter

	// lf counts ASCII line-feeds as their own byte-values: this means its
	// value will always be at least the same as field `crlf`
	lf counter

	// crlf counts ASCII CRLF byte-pairs
	crlf counter

	// the type of byte-order mark detected
	bom bomType

	// name is the filepath of the file/source these stats are about
	name string

	// result keeps track of whether results are valid and/or ready
	result statResult
}

// updateStats does what it says, reading everything from a reader; it also
// marks the result as either a success or an error, which is what lets the
// output-reporting task know the result isn't pending anymore
func (res *stats) updateStats(r io.Reader) error {
	err := res.updateUsing(r)
	if err == io.EOF {
		err = nil
	}

	if err == nil {
		res.result = resultSuccess
	} else {
		res.result = resultError
	}
	return err
}

// checkBOM finds which byte-order mark, if any, the data given start with;
// the UTF-32 checks must come before the UTF-16 ones, since the UTF-32 LE
// mark starts with the same 2 bytes as the UTF-16 LE one
func checkBOM(data []byte) bomType {
	d := data
	l := len(data)

	if l >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf {
		return utf8BOM
	}
	if l >= 4 && d[0] == 0xff && d[1] == 0xfe && d[2] == 0 && d[3] == 0 {
		return utf32leBOM
	}
	if l >= 4 && d[0] == 0 && d[1] == 0 && d[2] == 0xfe && d[3] == 0xff {
		return utf32beBOM
	}
	if l >= 2 && d[0] == 0xff && d[1] == 0xfe {
		return utf16leBOM
	}
	if l >= 2 && d[0] == 0xfe && d[1] == 0xff {
		return utf16beBOM
	}

	return noBOM
}

// updateUsing helps func updateStats do its job: it reads until the end of
// the input, returning nil when that's reached, or the first non-EOF error
// from the reader; a reader which keeps giving neither data nor errors
// results in io.ErrNoProgress, instead of an endless loop
func (res *stats) updateUsing(r io.Reader) error {
	// maxEmptyReads limits consecutive reads giving no data and no error
	const maxEmptyReads = 100

	var width counter
	var prev1, prev2 byte
	var buf [16 * 1024]byte
	var tallies [256]uint64

	// head accumulates the first few bytes across reads, since readers may
	// give fewer bytes than a byte-order mark needs on their first read
	var head [4]byte
	headLen := 0
	emptyReads := 0

	for {
		n, err := r.Read(buf[:])

		if n > 0 {
			emptyReads = 0
			chunk := buf[:n]

			if headLen < len(head) {
				headLen += copy(head[headLen:], chunk)
				res.bom = checkBOM(head[:headLen])
			}
			res.bytes += n

			for _, b := range chunk {
				// tally all byte-values, so several stats can be derived
				// from these totals when input is over
				tallies[b]++

				// handle non-ASCII runes, assuming input is valid UTF-8:
				// continuation bytes (10xxxxxx) don't start new runes
				res.runes += 1 - count(b&0xc0, 0x80)

				// handle line-feeds
				if b == '\n' {
					crlf := count(prev1, '\r')
					res.crlf += crlf

					// count lines with trailing spaces, whether these end with
					// a CRLF byte-pair or just a line-feed byte
					if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
						res.trailing++
					}

					// exclude any CR from the current line's width-count
					width -= crlf
					if res.maxWidth < width {
						res.maxWidth = width
					}

					prev2 = prev1
					prev1 = b
					width = 0
					continue
				}

				prev2 = prev1
				prev1 = b
				width++
			}
		}

		// readers are allowed to give data along with an error, so errors
		// are checked only after handling any data
		if err == io.EOF {
			res.applyTallies(&tallies)
			return res.handleEnd(width, prev1, prev2)
		}
		if err != nil {
			return err
		}

		if n < 1 {
			// no data and no error isn't the end of the input: try again,
			// unless the reader seems permanently stuck
			emptyReads++
			if emptyReads >= maxEmptyReads {
				return io.ErrNoProgress
			}
		}
	}
}

// applyTallies updates all the stats derived from the byte-value totals
// given; it's only meant to be used by func updateUsing
func (res *stats) applyTallies(tallies *[256]uint64) {
	res.lines = counter(tallies['\n'])
	res.tabs = counter(tallies['\t'])
	res.spaces = counter(tallies[' '])
	res.lf = counter(tallies['\n'])
	res.nulls = counter(tallies[0])
	res.fulls = counter(tallies[255])
	for _, n := range tallies[128:] {
		res.highs += counter(n)
	}
}

// handleEnd fixes/finalizes stats when input data end; this func is only
// meant to be used by func updateUsing, since it takes some of the latter's
// local variables
func (res *stats) handleEnd(width counter, prev1, prev2 byte) error {
	// a last line lacking a line-feed can still have trailing spaces
	if prev1 == ' ' || (prev2 == ' ' && prev1 == '\r') {
		res.trailing++
	}

	if res.maxWidth < width {
		res.maxWidth = width
	}

	// avoid reporting 0 lines with a non-0 byte-count: this is unlike the
	// standard cmd-line tool `wc`
	if res.bytes > 0 && prev1 != '\n' {
		res.lines++
	}

	return nil
}

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly/branchlessly to totals
// func count(x, y byte) counter {
// 	return counter(isZero[x^y])
// }

// count checks if 2 bytes are the same, returning either 0 or 1, which can
// be added directly/branchlessly to totals
func count(x, y byte) counter {
	if x != y {
		return 0
	}
	return 1
}

// countLeadingReady finds how many items are ready to show at the start of a
// results-slice, which ensures output matches the original item-order
func countLeadingReady(values []stats) int {
	for i, v := range values {
		if v.result == resultPending {
			return i
		}
	}
	return len(values)
}