#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# frep [options...] [regex] [filepaths/URIs...]
#
# Flat Regular-Expression Print(er) shows all regex-matches on binary data,
# without being limited by end-of-line-style byte-sequences.
#
# Matches are always shown ANSI-styled, starting with the data-source name
# (a filepath, or URI), the 0-based byte-offset range of the match, a few
# preceding bytes (unstyled), the bytes matched (styled), and a few following
# bytes (also unstyled), the surrounding unstyled bytes shown for context.
#
# All (optional) leading options start with either single or double-dash,
# and most of them change the style/color used. Some of the options are,
# shown in their single-dash form:
#
#     -h       show this help message
#     -help    show this help message
#
#     -i       case-insensitive matching of ASCII letters


from re import compile as compile_re, IGNORECASE, Pattern
from sys import argv, exit, stderr, stdin, stdout
from urllib.request import urlopen


# info is the help message shown when asked to
info = '''
frep [options...] [regex] [filepaths/URIs...]

Flat Regular-Expression Print(er) shows all regex-matches on binary data,
without being limited by end-of-line-style byte-sequences.

Matches are always shown ANSI-styled, starting with the data-source name
(a filepath, or URI), the 0-based byte-offset range of the match, a few
preceding bytes (unstyled), the bytes matched (styled), and a few following
bytes (also unstyled), the surrounding unstyled bytes shown for context.

All (optional) leading options start with either single or double-dash,
and most of them change the style/color used. Some of the options are,
shown in their single-dash form:

    -h       show this help message
    -help    show this help message

    -i       case-insensitive matching of ASCII letters
'''.strip()


def seems_url(s: str) -> bool:
    '''Check whether a name starts with one of the supported URI schemes.'''

    return s.startswith(('https://', 'http://', 'file://', 'ftp://', 'data:'))


def read_input(path: str) -> bytes:
    '''
    Load all bytes from a named input.

    The name `-` means standard input, names which seem to be URIs are
    fetched via urlopen, and anything else is opened as a binary file.
    Errors from opening/reading the source are left to the caller.
    '''

    if path == '-':
        return stdin.buffer.read()

    if seems_url(path):
        # read from the response itself, never from standard input
        with urlopen(path) as inp:
            return inp.read()

    with open(path, 'rb') as inp:
        return inp.read()


def handle_input(
    w, name: str, data: bytes, expr: Pattern, padding: int = 20,
) -> None:
    '''
    Show all regex-matches using ANSI-styles.

    Each match is written to `w` as a single line: the source name, the
    0-based byte range of the match, up to `padding` unstyled bytes before
    the match, the styled match, and up to `padding` unstyled bytes after it.

    Parameters:
        w        writer which accepts bytes via its `write` method
        name     data-source name shown at the start of each result
        data     the bytes to search
        expr     compiled bytes-pattern to look for
        padding  max number of context bytes shown on each side of a match
    '''

    # match_style = b'\x1b[42m\x1b[97m'
    match_style = b'\x1b[48;5;29m\x1b[97m'
    reset_style = b'\x1b[0m'

    for m in expr.finditer(data):
        i = m.start()
        j = m.end()

        # clamp the leading slice, so matches close to the start of the
        # data still show whatever context is available: a negative start
        # index would wrap around to the end of the data instead
        before = data[max(0, i - padding):i]
        # slicing past the end is safe, and merely gives fewer bytes
        after = data[j:(j + padding)]

        w.write(f'\x1b[35m{name} \x1b[34m[{i:09}:{j:09}]'.encode('utf-8'))
        if before:
            w.write(reset_style)
            w.write(before)
        w.write(match_style)
        w.write(data[i:j])
        if after:
            w.write(reset_style)
            w.write(after)
        w.write(reset_style + b'\n')


def main() -> None:
    '''Run the command-line tool, using the arguments in sys.argv.'''

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info, file=stderr)
        exit(0)

    try:
        start_args = 2
        re_mode = 0  # NOFLAG
        if len(argv) > 1 and argv[1] in ('-i', '--i'):
            re_mode = IGNORECASE
            start_args = 3

        if len(argv) < start_args:
            raise ValueError('no regex given')

        # the regex is compiled as a bytes-pattern, since inputs are binary
        pattern = argv[start_args - 1].encode('utf-8')
        expr = compile_re(pattern, flags=re_mode)
        args = argv[start_args:]

        if args.count('-') > 1:
            msg = 'reading from `-` (standard input) more than once not allowed'
            raise ValueError(msg)

        # when no names are given, read all bytes from standard input
        for path in (args or ['-']):
            # passing the loaded bytes directly lets each input be released
            # right after it's handled, before the next one is loaded
            handle_input(stdout.buffer, path, read_input(path), expr)
    except (BrokenPipeError, KeyboardInterrupt):
        # quit quietly, instead of showing a confusing error message
        stderr.flush()
        stderr.close()
    except Exception as e:
        print(f'\x1b[31m{e}\x1b[0m', file=stderr)
        exit(1)


if __name__ == '__main__':
    main()