File: frep.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 # frep [options...] [regex] [filepaths/URIs...]
  27 #
  28 # Flat Regular-Expression Print(er) shows all regex-matches on binary data,
  29 # without being limited by end-of-line-style byte-sequences.
  30 #
  31 # Matches are always shown ANSI-styled, starting with the data-source name
  32 # (a filepath, or URI), the 0-based byte-offset range of the match, a few
  33 # preceding bytes (unstyled), the bytes matched (styled), and a few following
  34 # bytes (also unstyled), the surrounding unstyled bytes shown for context.
  35 #
  36 # All (optional) leading options start with either single or double-dash,
  37 # and most of them change the style/color used. Some of the options are,
  38 # shown in their single-dash form:
  39 #
  40 #     -h          show this help message
  41 #     -help       show this help message
  42 #
  43 #     -i          case-insensitive matching of ASCII letters
  44 
  45 
  46 from re import compile as compile_re, IGNORECASE, Pattern
  47 from sys import argv, exit, stderr, stdin, stdout
  48 from urllib.request import urlopen
  49 
  50 
  51 # info is the help message shown when asked to
  52 info = '''
  53 frep [options...] [regex] [filepaths/URIs...]
  54 
  55 Flat Regular-Expression Print(er) shows all regex-matches on binary data,
  56 without being limited by end-of-line-style byte-sequences.
  57 
  58 Matches are always shown ANSI-styled, starting with the data-source name
  59 (a filepath, or URI), the 0-based byte-offset range of the match, a few
  60 preceding bytes (unstyled), the bytes matched (styled), and a few following
  61 bytes (also unstyled), the surrounding unstyled bytes shown for context.
  62 
  63 All (optional) leading options start with either single or double-dash,
  64 and most of them change the style/color used. Some of the options are,
  65 shown in their single-dash form:
  66 
  67     -h          show this help message
  68     -help       show this help message
  69 
  70     -i          case-insensitive matching of ASCII letters
  71 '''.strip()
  72 
  73 # handle standard help cmd-line options, quitting right away in that case
  74 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  75     print(info, file=stderr)
  76     exit(0)
  77 
  78 
  79 def seems_url(s: str) -> bool:
  80     for prot in ('https://', 'http://', 'file://', 'ftp://', 'data:'):
  81         if s.startswith(prot):
  82             return True
  83     return False
  84 
  85 
  86 def handle_input(w, name: str, data: bytes, expr: Pattern) -> None:
  87     '''Show all regex-matches using ANSI-styles'''
  88 
  89     padding = 20
  90     # match_style = b'\x1b[42m\x1b[97m'
  91     match_style = b'\x1b[48;5;29m\x1b[97m'
  92 
  93     for m in expr.finditer(data):
  94         i = m.start()
  95         j = m.end()
  96 
  97         w.write(f'\x1b[35m{name} \x1b[34m[{i:09}:{j:09}]'.encode('utf-8'))
  98         if i >= padding:
  99             w.write(b'\x1b[0m')
 100             w.write(data[(i - padding):i])
 101         w.write(match_style)
 102         w.write(data[i:j])
 103         if j < len(data) - padding:
 104             w.write(b'\x1b[0m')
 105             w.write(data[j:(j + padding)])
 106         w.write(b'\x1b[0m\n')
 107 
 108 try:
 109     start_args = 2
 110     re_mode = 0 # NOFLAG
 111     if len(argv) > 1 and argv[1] in ('-i', '--i'):
 112         re_mode = IGNORECASE
 113         start_args = 3
 114 
 115     if len(argv) < start_args:
 116         raise ValueError('no regex given')
 117 
 118     expr = compile_re(argv[start_args - 1].encode('utf-8'), flags=re_mode)
 119     args = argv[start_args:]
 120 
 121     if args.count('-') > 1:
 122         msg = 'reading from `-` (standard input) more than once not allowed'
 123         raise ValueError(msg)
 124 
 125     # handle all named inputs given
 126     for path in args:
 127         if path == '-':
 128             data = stdin.buffer.read()
 129             handle_input(stdout.buffer, path, data, expr)
 130             data = bytes()
 131             continue
 132 
 133         if seems_url(path):
 134             with urlopen(path) as inp:
 135                 data = stdin.buffer.read()
 136             handle_input(stdout.buffer, path, data, expr)
 137             data = bytes()
 138             continue
 139 
 140         with open(path, 'rb') as inp:
 141             data = inp.read()
 142         handle_input(stdout.buffer, path, data, expr)
 143         data = bytes()
 144 
 145     # when no filenames are given, handle lines from stdin
 146     if len(args) == 0:
 147         data = stdin.buffer.read()
 148         handle_input(stdout.buffer, '-', data, expr)
 149         data = bytes()
 150 except (BrokenPipeError, KeyboardInterrupt):
 151     # quit quietly, instead of showing a confusing error message
 152     stderr.flush()
 153     stderr.close()
 154 except Exception as e:
 155     print(f'\x1b[31m{e}\x1b[0m', file=stderr)
 156     exit(1)