File: coby.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 # coby [filepaths/URIs...]
  27 #
  28 # COunt BYtes finds various byte-related stats for the files/URIs given. When
  29 # given no named inputs, it uses standard input by default.
  30 
  31 
  32 from multiprocessing import Pool
  33 from sys import argv, exit, stderr, stdin, stdout
  34 from typing import Any, List
  35 from urllib.parse import urlparse, urlunparse
  36 from urllib.request import urlopen
  37 
  38 
  39 # info is the message shown when the leading argument is one of the standard
  40 # cmd-line help options
  41 info = '''
  42 coby [filepaths/URIs...]
  43 
  44 COunt BYtes finds various byte-related stats for the files/URIs given. When
  45 given no named inputs, it uses standard input by default.
  46 '''.strip()
  47 
  48 # a leading help-option arg means show the help message and quit
  49 if len(argv) == 2 and argv[1].lower() in ('-h', '--h', '-help', '--help'):
  50     print(info, file=stderr)
  51     exit(0)
  52 
  53 
  54 def fail(msg, code: int = 1) -> None:
  55     '''Show the error message given, and quit the app right away.'''
  56     print(f'\x1b[31m{msg}\x1b[0m', file=stderr)
  57     exit(code)
  58 
  59 
  60 def count_bytes(src) -> List[Any]:
  61     n = 0
  62     lf = 0
  63     lines = 0
  64     crlf = 0
  65     spaces = 0
  66     tabs = 0
  67     trails = 0
  68     nulls = 0
  69     fulls = 0
  70     highs = 0
  71 
  72     # counting lines with trailing spaces needs remembering the previous 2
  73     # bytes, as the last trailing space in a line can come either before a
  74     # single line-feed byte, or a CRLF byte-pair
  75     prev2 = 0
  76     prev1 = 0
  77 
  78     for chunk in src:
  79         n += len(chunk)
  80         # bulk-counting via bytes.count is noticeably faster
  81         lf += chunk.count(10) # 10 is ord('\n')
  82         tabs += chunk.count(9) # 9 is ord('\t')
  83         spaces += chunk.count(32) # 32 is ord(' ')
  84         nulls += chunk.count(0)
  85         fulls += chunk.count(255)
  86 
  87         # some stats must be handled byte-by-byte, as slow as that can be
  88         for b in chunk:
  89             is10 = b == 10 # 10 is ord('\n')
  90             crlf += is10 and prev1 == 13 # 13 is ord('\r')
  91             trails += (is10 and prev1 == 32) or (prev1 == 13 and prev2 == 32)
  92             highs += int(b >= 128)
  93             # notice how the last 2 bytes are remembered even across chunks
  94             prev2 = prev1
  95             prev1 = b
  96 
  97     lines = lf
  98     if lines == 0 and n > 0:
  99         lines += 1
 100     return [n, lines, lf, crlf, trails, tabs, spaces, nulls, fulls, highs]
 101 
 102 
 103 def handle_named_input(path: str) -> List[Any]:
 104     if path == '-':
 105         return count_bytes(stdin.buffer)
 106 
 107     if path.startswith('https://') or path.startswith('http://'):
 108         with urlopen(path) as inp:
 109             return count_bytes(inp)
 110 
 111     with open(path, 'rb') as inp:
 112         return count_bytes(inp)
 113 
 114 
 115 header = '''
 116 name\tbytes\tlines\tlf\tcrlf\ttrails\ttabs\tspaces\tnulls\tfulls\thighs
 117 '''.strip()
 118 
 119 try:
 120     args = argv[1:]
 121     if args.count('-') > 1:
 122         msg = 'reading from `-` (standard input) more than once not allowed'
 123         raise ValueError(msg)
 124 
 125     # given no named inputs, just use stdin
 126     if len(args) == 0:
 127         args = ['-']
 128 
 129     stdout.reconfigure(newline='\n', encoding='utf-8')
 130     # show header line right away, to reassure users something's happening
 131     stdout.write(header)
 132     stdout.write('\n')
 133 
 134     if len(args) == 1:
 135         # don't bother starting multiple interpreters for a single input
 136         results = [handle_named_input(args[0])]
 137     else:
 138         # vastly speed-up script by handling multiple inputs concurrently
 139         with Pool(processes=min(4, len(args))) as pool:
 140             results = pool.map(handle_named_input, args)
 141 
 142     for name, counts in zip(args, results):
 143         stdout.write(name)
 144         for e in counts:
 145             stdout.write('\t')
 146             stdout.write(str(e))
 147         stdout.write('\n')
 148 except Exception as e:
 149     fail(e, 1)