#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# coby [filepaths/URIs...]
#
# COunt BYtes finds various byte-related stats for the files/URIs given. When
# given no named inputs, it uses standard input by default.


from multiprocessing import Pool
from sys import argv, exit, stderr, stdin, stdout
from typing import Any, Iterable, List
from urllib.parse import urlparse, urlunparse
from urllib.request import urlopen


# info is the message shown when the leading argument is one of the standard
# cmd-line help options
info = '''
coby [filepaths/URIs...]

COunt BYtes finds various byte-related stats for the files/URIs given. When
given no named inputs, it uses standard input by default.
'''.strip()

# a leading help-option arg means show the help message and quit
if len(argv) == 2 and argv[1].lower() in ('-h', '--h', '-help', '--help'):
    print(info, file=stderr)
    exit(0)


def fail(msg, code: int = 1) -> None:
    '''Show the error message given, and quit the app right away.'''
    print(f'\x1b[31m{msg}\x1b[0m', file=stderr)
    exit(code)


def count_bytes(src: Iterable[bytes]) -> List[Any]:
    '''
    Count byte-related stats over all the chunks of bytes the source yields.

    The source can be any iterable of bytes-like chunks, such as a file
    opened in binary mode, which yields its content one line at a time.

    The result is a list with these counts, in this order: bytes, lines,
    line-feeds, CRLF pairs, lines with trailing spaces, tabs, spaces, null
    bytes, 255-valued bytes, and bytes with values 128 and above.

    The line count is the number of line-feeds, plus 1 when the input is
    not empty and does not end with a line-feed.
    '''

    n = 0
    lf = 0
    crlf = 0
    spaces = 0
    tabs = 0
    trails = 0
    nulls = 0
    fulls = 0
    highs = 0

    # counting lines with trailing spaces needs remembering the previous 2
    # bytes, as the last trailing space in a line can come either before a
    # single line-feed byte, or a CRLF byte-pair
    prev2 = 0
    prev1 = 0

    for chunk in src:
        n += len(chunk)
        # bulk-counting via bytes.count is noticeably faster
        lf += chunk.count(10)  # 10 is ord('\n')
        tabs += chunk.count(9)  # 9 is ord('\t')
        spaces += chunk.count(32)  # 32 is ord(' ')
        nulls += chunk.count(0)
        fulls += chunk.count(255)

        # some stats must be handled byte-by-byte, as slow as that can be
        for b in chunk:
            if b == 10:  # 10 is ord('\n')
                if prev1 == 13:  # 13 is ord('\r')
                    crlf += 1
                    # only a completed CRLF pair can end a line: a space
                    # followed by a lone carriage-return doesn't count
                    if prev2 == 32:
                        trails += 1
                elif prev1 == 32:
                    trails += 1
            elif b >= 128:
                highs += 1
            # notice how the last 2 bytes are remembered even across chunks
            prev2 = prev1
            prev1 = b

    # a final line lacking its line-feed is still a line: prev1 is the last
    # byte read whenever the input isn't empty
    lines = lf
    if n > 0 and prev1 != 10:
        lines += 1
    return [n, lines, lf, crlf, trails, tabs, spaces, nulls, fulls, highs]


def handle_named_input(path: str) -> List[Any]:
    '''
    Count byte-related stats for the named input given.

    The name `-` means standard input, names starting with `http://` or
    `https://` are fetched as URIs, and anything else is opened as a file.
    Errors from opening/reading the input are left to the caller to handle.
    '''

    if path == '-':
        return count_bytes(stdin.buffer)

    if path.startswith(('https://', 'http://')):
        with urlopen(path) as inp:
            return count_bytes(inp)

    with open(path, 'rb') as inp:
        return count_bytes(inp)


# header is the first line of output, naming all the tab-separated columns
header = '''
name\tbytes\tlines\tlf\tcrlf\ttrails\ttabs\tspaces\tnulls\tfulls\thighs
'''.strip()

def main() -> None:
    '''
    Count bytes for all inputs named in the cmd-line args, showing results
    as lines of tab-separated values on standard output.

    Any error is shown on standard error, and makes the app quit with a
    non-zero exit code.
    '''

    try:
        args = argv[1:]
        if args.count('-') > 1:
            msg = 'reading from `-` (standard input) more than once not allowed'
            raise ValueError(msg)

        # given no named inputs, just use stdin
        if not args:
            args = ['-']

        stdout.reconfigure(newline='\n', encoding='utf-8')
        # show header line right away, to reassure users something's happening
        stdout.write(header)
        stdout.write('\n')
        # without flushing, a buffered header may not show up until the end
        stdout.flush()

        if len(args) == 1:
            # don't bother starting multiple interpreters for a single input
            results = [handle_named_input(args[0])]
        else:
            # multiprocessing closes standard input in its worker processes,
            # so `-` must be read from this process; since `-` can appear at
            # most once, there's always at least 1 input left for the pool
            named = [name for name in args if name != '-']
            # vastly speed-up script by handling multiple inputs concurrently
            with Pool(processes=min(4, len(named))) as pool:
                pending = pool.map_async(handle_named_input, named)
                stdin_counts = None
                if '-' in args:
                    # read stdin while the workers handle the other inputs
                    stdin_counts = handle_named_input('-')
                pooled = iter(pending.get())

            # put results back in the same order as the cmd-line args
            results = []
            for name in args:
                if name == '-':
                    results.append(stdin_counts)
                else:
                    results.append(next(pooled))

        for name, counts in zip(args, results):
            stdout.write(name)
            for e in counts:
                stdout.write('\t')
                stdout.write(str(e))
            stdout.write('\n')
    except Exception as e:
        fail(e, 1)


# the guard keeps worker processes which re-import this script (as with the
# `spawn` and `forkserver` start methods) from re-running it themselves
if __name__ == '__main__':
    main()