File: nh.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from os import fstat
  27 from sys import argv, exit, stderr, stdin, stdout
  28 
  29 
# info is the help message: the script prints it (stripped of its leading
# and trailing whitespace) to stderr when the first argument is one of the
# help options
# NOTE(review): the script also accepts a leading integer argument to set
# the number of bytes shown per line, and `-` as a name for standard input;
# neither is mentioned in this message
info = '''
nh [options...] [filepaths/URIs...]


Nice Hexadecimals is a byte-viewer which shows bytes as base-16 values,
using various ANSI styles to color-code output.

Output lines end with a panel showing all ASCII sequences detected along:
each such panel also includes all ASCII from the next row as well, since
not doing that would make grepping/matching whole strings less reliable,
as some matches may be missed simply due to the narrowness of the panel.

Options, where leading double-dashes are also allowed:

    -h         show this help message
    -help      same as -h

    -n         narrow output, which fits 80-column mode
    -narrow    same as -n
'''
  50 
  51 
  52 # bytes2styled_hex has `pre-rendered` strings for each possible byte
  53 bytes2styled_hex = (
  54     '\x1b[38;5;111m00 ', '\x1b[38;5;246m01 ', '\x1b[38;5;246m02 ',
  55     '\x1b[38;5;246m03 ', '\x1b[38;5;246m04 ', '\x1b[38;5;246m05 ',
  56     '\x1b[38;5;246m06 ', '\x1b[38;5;246m07 ', '\x1b[38;5;246m08 ',
  57     '\x1b[38;5;246m09 ', '\x1b[38;5;246m0a ', '\x1b[38;5;246m0b ',
  58     '\x1b[38;5;246m0c ', '\x1b[38;5;246m0d ', '\x1b[38;5;246m0e ',
  59     '\x1b[38;5;246m0f ', '\x1b[38;5;246m10 ', '\x1b[38;5;246m11 ',
  60     '\x1b[38;5;246m12 ', '\x1b[38;5;246m13 ', '\x1b[38;5;246m14 ',
  61     '\x1b[38;5;246m15 ', '\x1b[38;5;246m16 ', '\x1b[38;5;246m17 ',
  62     '\x1b[38;5;246m18 ', '\x1b[38;5;246m19 ', '\x1b[38;5;246m1a ',
  63     '\x1b[38;5;246m1b ', '\x1b[38;5;246m1c ', '\x1b[38;5;246m1d ',
  64     '\x1b[38;5;246m1e ', '\x1b[38;5;246m1f ',
  65     '\x1b[38;5;72m20\x1b[38;5;239m ', '\x1b[38;5;72m21\x1b[38;5;239m!',
  66     '\x1b[38;5;72m22\x1b[38;5;239m"', '\x1b[38;5;72m23\x1b[38;5;239m#',
  67     '\x1b[38;5;72m24\x1b[38;5;239m$', '\x1b[38;5;72m25\x1b[38;5;239m%',
  68     '\x1b[38;5;72m26\x1b[38;5;239m&', '\x1b[38;5;72m27\x1b[38;5;239m\'',
  69     '\x1b[38;5;72m28\x1b[38;5;239m(', '\x1b[38;5;72m29\x1b[38;5;239m)',
  70     '\x1b[38;5;72m2a\x1b[38;5;239m*', '\x1b[38;5;72m2b\x1b[38;5;239m+',
  71     '\x1b[38;5;72m2c\x1b[38;5;239m,', '\x1b[38;5;72m2d\x1b[38;5;239m-',
  72     '\x1b[38;5;72m2e\x1b[38;5;239m.', '\x1b[38;5;72m2f\x1b[38;5;239m/',
  73     '\x1b[38;5;72m30\x1b[38;5;239m0', '\x1b[38;5;72m31\x1b[38;5;239m1',
  74     '\x1b[38;5;72m32\x1b[38;5;239m2', '\x1b[38;5;72m33\x1b[38;5;239m3',
  75     '\x1b[38;5;72m34\x1b[38;5;239m4', '\x1b[38;5;72m35\x1b[38;5;239m5',
  76     '\x1b[38;5;72m36\x1b[38;5;239m6', '\x1b[38;5;72m37\x1b[38;5;239m7',
  77     '\x1b[38;5;72m38\x1b[38;5;239m8', '\x1b[38;5;72m39\x1b[38;5;239m9',
  78     '\x1b[38;5;72m3a\x1b[38;5;239m:', '\x1b[38;5;72m3b\x1b[38;5;239m;',
  79     '\x1b[38;5;72m3c\x1b[38;5;239m<', '\x1b[38;5;72m3d\x1b[38;5;239m=',
  80     '\x1b[38;5;72m3e\x1b[38;5;239m>', '\x1b[38;5;72m3f\x1b[38;5;239m?',
  81     '\x1b[38;5;72m40\x1b[38;5;239m@', '\x1b[38;5;72m41\x1b[38;5;239mA',
  82     '\x1b[38;5;72m42\x1b[38;5;239mB', '\x1b[38;5;72m43\x1b[38;5;239mC',
  83     '\x1b[38;5;72m44\x1b[38;5;239mD', '\x1b[38;5;72m45\x1b[38;5;239mE',
  84     '\x1b[38;5;72m46\x1b[38;5;239mF', '\x1b[38;5;72m47\x1b[38;5;239mG',
  85     '\x1b[38;5;72m48\x1b[38;5;239mH', '\x1b[38;5;72m49\x1b[38;5;239mI',
  86     '\x1b[38;5;72m4a\x1b[38;5;239mJ', '\x1b[38;5;72m4b\x1b[38;5;239mK',
  87     '\x1b[38;5;72m4c\x1b[38;5;239mL', '\x1b[38;5;72m4d\x1b[38;5;239mM',
  88     '\x1b[38;5;72m4e\x1b[38;5;239mN', '\x1b[38;5;72m4f\x1b[38;5;239mO',
  89     '\x1b[38;5;72m50\x1b[38;5;239mP', '\x1b[38;5;72m51\x1b[38;5;239mQ',
  90     '\x1b[38;5;72m52\x1b[38;5;239mR', '\x1b[38;5;72m53\x1b[38;5;239mS',
  91     '\x1b[38;5;72m54\x1b[38;5;239mT', '\x1b[38;5;72m55\x1b[38;5;239mU',
  92     '\x1b[38;5;72m56\x1b[38;5;239mV', '\x1b[38;5;72m57\x1b[38;5;239mW',
  93     '\x1b[38;5;72m58\x1b[38;5;239mX', '\x1b[38;5;72m59\x1b[38;5;239mY',
  94     '\x1b[38;5;72m5a\x1b[38;5;239mZ', '\x1b[38;5;72m5b\x1b[38;5;239m[',
  95     '\x1b[38;5;72m5c\x1b[38;5;239m\\', '\x1b[38;5;72m5d\x1b[38;5;239m]',
  96     '\x1b[38;5;72m5e\x1b[38;5;239m^', '\x1b[38;5;72m5f\x1b[38;5;239m_',
  97     '\x1b[38;5;72m60\x1b[38;5;239m`', '\x1b[38;5;72m61\x1b[38;5;239ma',
  98     '\x1b[38;5;72m62\x1b[38;5;239mb', '\x1b[38;5;72m63\x1b[38;5;239mc',
  99     '\x1b[38;5;72m64\x1b[38;5;239md', '\x1b[38;5;72m65\x1b[38;5;239me',
 100     '\x1b[38;5;72m66\x1b[38;5;239mf', '\x1b[38;5;72m67\x1b[38;5;239mg',
 101     '\x1b[38;5;72m68\x1b[38;5;239mh', '\x1b[38;5;72m69\x1b[38;5;239mi',
 102     '\x1b[38;5;72m6a\x1b[38;5;239mj', '\x1b[38;5;72m6b\x1b[38;5;239mk',
 103     '\x1b[38;5;72m6c\x1b[38;5;239ml', '\x1b[38;5;72m6d\x1b[38;5;239mm',
 104     '\x1b[38;5;72m6e\x1b[38;5;239mn', '\x1b[38;5;72m6f\x1b[38;5;239mo',
 105     '\x1b[38;5;72m70\x1b[38;5;239mp', '\x1b[38;5;72m71\x1b[38;5;239mq',
 106     '\x1b[38;5;72m72\x1b[38;5;239mr', '\x1b[38;5;72m73\x1b[38;5;239ms',
 107     '\x1b[38;5;72m74\x1b[38;5;239mt', '\x1b[38;5;72m75\x1b[38;5;239mu',
 108     '\x1b[38;5;72m76\x1b[38;5;239mv', '\x1b[38;5;72m77\x1b[38;5;239mw',
 109     '\x1b[38;5;72m78\x1b[38;5;239mx', '\x1b[38;5;72m79\x1b[38;5;239my',
 110     '\x1b[38;5;72m7a\x1b[38;5;239mz', '\x1b[38;5;72m7b\x1b[38;5;239m{',
 111     '\x1b[38;5;72m7c\x1b[38;5;239m|', '\x1b[38;5;72m7d\x1b[38;5;239m}',
 112     '\x1b[38;5;72m7e\x1b[38;5;239m~', '\x1b[38;5;246m7f ',
 113     '\x1b[38;5;246m80 ', '\x1b[38;5;246m81 ', '\x1b[38;5;246m82 ',
 114     '\x1b[38;5;246m83 ', '\x1b[38;5;246m84 ', '\x1b[38;5;246m85 ',
 115     '\x1b[38;5;246m86 ', '\x1b[38;5;246m87 ', '\x1b[38;5;246m88 ',
 116     '\x1b[38;5;246m89 ', '\x1b[38;5;246m8a ', '\x1b[38;5;246m8b ',
 117     '\x1b[38;5;246m8c ', '\x1b[38;5;246m8d ', '\x1b[38;5;246m8e ',
 118     '\x1b[38;5;246m8f ', '\x1b[38;5;246m90 ', '\x1b[38;5;246m91 ',
 119     '\x1b[38;5;246m92 ', '\x1b[38;5;246m93 ', '\x1b[38;5;246m94 ',
 120     '\x1b[38;5;246m95 ', '\x1b[38;5;246m96 ', '\x1b[38;5;246m97 ',
 121     '\x1b[38;5;246m98 ', '\x1b[38;5;246m99 ', '\x1b[38;5;246m9a ',
 122     '\x1b[38;5;246m9b ', '\x1b[38;5;246m9c ', '\x1b[38;5;246m9d ',
 123     '\x1b[38;5;246m9e ', '\x1b[38;5;246m9f ', '\x1b[38;5;246ma0 ',
 124     '\x1b[38;5;246ma1 ', '\x1b[38;5;246ma2 ', '\x1b[38;5;246ma3 ',
 125     '\x1b[38;5;246ma4 ', '\x1b[38;5;246ma5 ', '\x1b[38;5;246ma6 ',
 126     '\x1b[38;5;246ma7 ', '\x1b[38;5;246ma8 ', '\x1b[38;5;246ma9 ',
 127     '\x1b[38;5;246maa ', '\x1b[38;5;246mab ', '\x1b[38;5;246mac ',
 128     '\x1b[38;5;246mad ', '\x1b[38;5;246mae ', '\x1b[38;5;246maf ',
 129     '\x1b[38;5;246mb0 ', '\x1b[38;5;246mb1 ', '\x1b[38;5;246mb2 ',
 130     '\x1b[38;5;246mb3 ', '\x1b[38;5;246mb4 ', '\x1b[38;5;246mb5 ',
 131     '\x1b[38;5;246mb6 ', '\x1b[38;5;246mb7 ', '\x1b[38;5;246mb8 ',
 132     '\x1b[38;5;246mb9 ', '\x1b[38;5;246mba ', '\x1b[38;5;246mbb ',
 133     '\x1b[38;5;246mbc ', '\x1b[38;5;246mbd ', '\x1b[38;5;246mbe ',
 134     '\x1b[38;5;246mbf ', '\x1b[38;5;246mc0 ', '\x1b[38;5;246mc1 ',
 135     '\x1b[38;5;246mc2 ', '\x1b[38;5;246mc3 ', '\x1b[38;5;246mc4 ',
 136     '\x1b[38;5;246mc5 ', '\x1b[38;5;246mc6 ', '\x1b[38;5;246mc7 ',
 137     '\x1b[38;5;246mc8 ', '\x1b[38;5;246mc9 ', '\x1b[38;5;246mca ',
 138     '\x1b[38;5;246mcb ', '\x1b[38;5;246mcc ', '\x1b[38;5;246mcd ',
 139     '\x1b[38;5;246mce ', '\x1b[38;5;246mcf ', '\x1b[38;5;246md0 ',
 140     '\x1b[38;5;246md1 ', '\x1b[38;5;246md2 ', '\x1b[38;5;246md3 ',
 141     '\x1b[38;5;246md4 ', '\x1b[38;5;246md5 ', '\x1b[38;5;246md6 ',
 142     '\x1b[38;5;246md7 ', '\x1b[38;5;246md8 ', '\x1b[38;5;246md9 ',
 143     '\x1b[38;5;246mda ', '\x1b[38;5;246mdb ', '\x1b[38;5;246mdc ',
 144     '\x1b[38;5;246mdd ', '\x1b[38;5;246mde ', '\x1b[38;5;246mdf ',
 145     '\x1b[38;5;246me0 ', '\x1b[38;5;246me1 ', '\x1b[38;5;246me2 ',
 146     '\x1b[38;5;246me3 ', '\x1b[38;5;246me4 ', '\x1b[38;5;246me5 ',
 147     '\x1b[38;5;246me6 ', '\x1b[38;5;246me7 ', '\x1b[38;5;246me8 ',
 148     '\x1b[38;5;246me9 ', '\x1b[38;5;246mea ', '\x1b[38;5;246meb ',
 149     '\x1b[38;5;246mec ', '\x1b[38;5;246med ', '\x1b[38;5;246mee ',
 150     '\x1b[38;5;246mef ', '\x1b[38;5;246mf0 ', '\x1b[38;5;246mf1 ',
 151     '\x1b[38;5;246mf2 ', '\x1b[38;5;246mf3 ', '\x1b[38;5;246mf4 ',
 152     '\x1b[38;5;246mf5 ', '\x1b[38;5;246mf6 ', '\x1b[38;5;246mf7 ',
 153     '\x1b[38;5;246mf8 ', '\x1b[38;5;246mf9 ', '\x1b[38;5;246mfa ',
 154     '\x1b[38;5;246mfb ', '\x1b[38;5;246mfc ', '\x1b[38;5;246mfd ',
 155     '\x1b[38;5;246mfe ', '\x1b[38;5;209mff ',
 156 )
 157 
 158 # int2ascii slightly speeds up func show_ascii
 159 int2ascii = tuple(chr(i) if 32 <= i < 127 else ' ' for i in range(256))
 160 
 161 # visible noticeably speeds up func show_ascii; notice how spaces (code 32)
 162 # aren't considered visible symbols, which makes sense in func show_ascii
 163 visible = tuple(32 < i < 127 for i in range(256))
 164 
 165 
 166 def show_hex(w, src, chunk_size: int = 16) -> None:
 167     'Handle all input from the source given, emitting styled output.'
 168 
 169     # make the ruler/line-breather, which shows up every 5 hex-output lines
 170     pre = 8 * ' '
 171     pat = '           ·'
 172     pat = int(3 * chunk_size / len(pat)) * pat
 173     sep_line = f'{pre}  \x1b[38;5;245m{pat}\x1b[0m\n'
 174 
 175     # n is the current byte offset shown at the start of each display line
 176     n = 0
 177 
 178     # lines keeps track of the main output line/row count, to figure out
 179     # when to put `breather` lines
 180     lines = 0
 181 
 182     # prev remembers the previous chunk, as showing ASCII content for
 183     # 2 output-lines worth of bytes requires staying 1 step behind, so
 184     # to speak
 185     prev = src.read(chunk_size)
 186     if len(prev) == 0:
 187         return
 188 
 189     while True:
 190         chunk = src.read(chunk_size)
 191         if len(chunk) == 0:
 192             break
 193 
 194         if lines % 5 == 0 and lines > 0:
 195             w.write(sep_line)
 196         show_line(w, n, prev, chunk, chunk_size)
 197 
 198         n += len(prev)
 199         prev = chunk
 200         lines += 1
 201 
 202     # don't forget the last output line
 203     if len(prev) > 0:
 204         if lines % 5 == 0 and lines > 0:
 205             w.write(sep_line)
 206         show_line(w, n, prev, bytes(), chunk_size)
 207 
 208 
 209 def show_line(w, n: int, prev, chunk, chunk_size: int) -> None:
 210     'Help func show_hex do its job, simplifying its control flow.'
 211 
 212     # w.write(f'{n:8}  \x1b[48;5;254m')
 213     show_restyled_uint(w, n, 8)
 214     w.write('  \x1b[48;5;254m')
 215     for e in prev:
 216         w.write(bytes2styled_hex[e])
 217     w.write('\x1b[0m')
 218     show_ascii(w, prev, chunk, 3 * (chunk_size - len(prev)) + 2)
 219     w.write('\n')
 220 
 221 
 222 def show_restyled_uint(w, n: int, width: int) -> None:
 223     'Alternate styles on 3-item chunks of digits from the integer given.'
 224 
 225     digits = str(n)
 226     l = len(digits)
 227 
 228     # left-pad digits with spaces to fill the output-width given
 229     write_spaces(w, width - l)
 230 
 231     # it's quicker to just emit short-enough digit-runs verbatim
 232     if l < 4:
 233         w.write(digits)
 234         return
 235 
 236     # emit leading chunk of digits, which is the only one which
 237     # can have fewer than 3 items
 238     lead = l % 3
 239     w.write(digits[:lead])
 240 
 241     # the rest of the string now has a multiple of 3 items left
 242     start = lead
 243 
 244     # start by styling the next digit-group only if there was a
 245     # non-empty leading group at the start of the full digit-run
 246     use_style = lead > 0
 247 
 248     # alternate styles until the string is over
 249     while start < l:
 250         # the digits left are always a multiple of 3
 251         stop = start + 3
 252 
 253         if use_style:
 254             w.write('\x1b[38;5;248m')
 255             w.write(digits[start:stop])
 256             w.write('\x1b[0m')
 257         else:
 258             w.write(digits[start:stop])
 259 
 260         # switch style and advance to the next 3-digit chunk
 261         use_style = not use_style
 262         start = stop
 263 
 264 
 265 def show_ascii(w, first, second: bytes, pre: int) -> None:
 266     'Emit the ASCII side-panel for func show_hex.'
 267 
 268     # prev_vis keeps track of the previous byte's `visibility`, so spaces
 269     # are added when bytes change from non-visible-ASCII to visible-ASCII
 270     prev_vis = False
 271 
 272     is_vis = False
 273     spaces = pre
 274 
 275     # show ASCII symbols from the first `line` in the pair
 276     for e in first:
 277         is_vis = visible[e]
 278         if is_vis:
 279             if not prev_vis:
 280                 write_spaces(w, spaces)
 281                 spaces = 1
 282             w.write(int2ascii[e])
 283         prev_vis = is_vis
 284 
 285     # do the same for the second `line` in the pair
 286     for e in second:
 287         is_vis = visible[e]
 288         if is_vis:
 289             if not prev_vis:
 290                 write_spaces(w, spaces)
 291                 spaces = 1
 292             w.write(int2ascii[e])
 293         prev_vis = is_vis
 294 
 295 
 296 def write_spaces(w, n: int) -> None:
 297     'Emit the number of spaces given, minimizing `write` calls.'
 298 
 299     if n < 1:
 300         return
 301 
 302     if n < len(spaces):
 303         w.write(spaces[n])
 304         return
 305 
 306     while n >= len(spaces):
 307         w.write(spaces[-1])
 308         n -= len(spaces)
 309     w.write(spaces[n])
 310 
 311 
 312 def seems_url(s: str) -> bool:
 313     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 314     return any(s.startswith(p) for p in protocols)
 315 
 316 
 317 # args is the `proper` list of arguments given to the script
 318 args = argv[1:]
 319 
 320 # a leading help-option arg means show the help message and quit
 321 if len(args) > 0 and args[0] in ('-h', '--h', '-help', '--help'):
 322     print(info.strip(), file=stderr)
 323     exit(0)
 324 
 325 # narrow-output is to fit results in 80-column mode
 326 bytes_per_line = 16
 327 if len(args) > 0 and args[0] in ('-n', '--n', '-narrow', '--narrow'):
 328     bytes_per_line = 12
 329     args = args[1:]
 330 elif len(args) > 0:
 331     # allow a leading integer argument to set exactly how many bytes per
 332     # line to show in the styled output, before the ASCII-panel contents
 333     try:
 334         # try to parse an integer number, after turning double-dashes
 335         # into single ones, which may lead to parsed negative integers
 336         n = int(args[0].lstrip('-'))
 337         # negative integers are a result of option-style leading dashes
 338         n = int(abs(n))
 339 
 340         if n > 0:
 341             # only change the width-setting if leading number isn't zero
 342             bytes_per_line = n
 343         # don't treat a leading integer as a filepath, no matter what
 344         args = args[1:]
 345     except Exception:
 346         # avoid exceptions if leading arg isn't a valid integer
 347         pass
 348 
 349 # spaces lets func write_spaces minimize `write` operations, resulting in
 350 # noticeable speed-ups when the script deals with megabytes of data
 351 spaces = tuple(i * ' ' for i in range(3 * bytes_per_line + 4))
 352 
 353 try:
 354     if args.count('-') > 1:
 355         msg = 'reading from `-` (standard input) more than once not allowed'
 356         raise ValueError(msg)
 357 
 358     if any(seems_url(e) for e in args):
 359         from urllib.request import urlopen
 360 
 361     for i, path in enumerate(args):
 362         if i > 0:
 363             stdout.write('\n')
 364             stdout.write('\n')
 365 
 366         if path == '-':
 367             stdout.write('• - (<stdin>)\n')
 368             stdout.write('\n')
 369             show_hex(stdout, stdin.buffer, bytes_per_line)
 370             continue
 371 
 372         if seems_url(path):
 373             with urlopen(path) as inp:
 374                 stdout.write(f'{path}\n')
 375                 stdout.write('\n')
 376                 show_hex(stdout, inp, bytes_per_line)
 377             continue
 378 
 379         with open(path, mode='rb', buffering=4_960) as inp:
 380             n = fstat(inp.fileno()).st_size
 381             stdout.write(f'{path}  \x1b[38;5;245m({n:,} bytes)\x1b[0m\n')
 382             stdout.write('\n')
 383             show_hex(stdout, inp, bytes_per_line)
 384 
 385     if len(args) == 0:
 386         stdout.write('• <stdin>\n')
 387         stdout.write('\n')
 388         show_hex(stdout, stdin.buffer, bytes_per_line)
 389 except BrokenPipeError:
 390     # quit quietly, instead of showing a confusing error message
 391     stderr.close()
 392 except KeyboardInterrupt:
 393     exit(2)
 394 except Exception as e:
 395     print(f'\x1b[31m{e}\x1b[0m', file=stderr)
 396     exit(1)