File: hima.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2025 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
from io import SEEK_CUR
from re import compile, Match, Pattern, IGNORECASE
from sys import argv, exit, maxsize, stderr, stdin, stdout
from typing import List, Optional
  30 
  31 
  32 info = '''
  33 hima [options...] [regexes...]
  34 
  35 
  36 HIlight MAtches ANSI-styles matching regular expressions along lines read
  37 from the standard input. The regular-expression mode used is a superset of
  38 the commonly-used "extended-mode".
  39 
  40 Regexes always avoid matching any ANSI-style sequences, to avoid messing
  41 those up. Also, multiple matches in a line never overlap: at each step
  42 along a line, the earliest-starting match among the regexes always wins,
  43 as the order regexes are given among the arguments never matters.
  44 
  45 The options are, available both in single and double-dash versions
  46 
  47     -h, -help     show this help message
  48     -i, -ins      match regexes case-insensitively
  49     -l, -links    add a case-insensitive regex to match HTTP/HTTPS links
  50 '''
  51 
  52 if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
  53     print(info.strip(), file=stderr)
  54     exit(0)
  55 
  56 # ansi_re matches ANSI-style sequences, so they're only matched `around`
  57 ansi_re = compile('\x1b\\[[0-9;]*[A-Za-z]')
  58 
  59 
  60 def match(src: str, start: int, stop: int, regexes: List[Pattern]) -> Match:
  61     first = None
  62     for expr in regexes:
  63         m = expr.search(src, start, stop)
  64         if (not first) or (m and m.start() < first.start()):
  65             first = m
  66     return first
  67 
  68 
  69 def style_line(w, s: str, regexes: List[Pattern], ansi_re: Pattern) -> None:
  70     # start is used outside the regex-match loop to handle trailing parts
  71     # in lines
  72     start = 0
  73 
  74     # replace all regex-matches on the line by surrounding each matched
  75     # substring with ANSI styles/resets
  76     while True:
  77         m = ansi_re.search(s, start)
  78         if not m:
  79             start = style_chunk(w, s, start, maxsize, regexes)
  80             break
  81 
  82         stop = m.start()
  83         start = style_chunk(w, s, start, stop, regexes)
  84         # don't forget the last part of the line, or the whole line
  85         stop = m.end()
  86         w.write(s[start:stop])
  87         start = stop
  88 
  89     # don't forget the last part of the line, or the whole line
  90     w.write(s[start:])
  91     w.write('\n')
  92 
  93 
  94 def style_chunk(w, s: str, start: int, stop: int, pats: List[Pattern]) -> int:
  95     while True:
  96         m = match(s, start, stop, pats)
  97         if not m:
  98             return start
  99 
 100         i = m.start()
 101         j = m.end()
 102 
 103         # part before match
 104         w.write(s[start:i])
 105 
 106         # current match
 107         w.write('\x1b[7m')
 108         w.write(s[i:j])
 109         w.write('\x1b[0m')
 110 
 111         # the end of the match is the start of the `rest` of the string
 112         start = j
 113 
 114 
 115 try:
 116     stdout.seek(0, SEEK_CUR)
 117     live = False
 118 except:
 119     live = True
 120 
 121 flags = 0
 122 args = argv[1:]
 123 find_links = False
 124 
 125 while len(args):
 126     if args[0] in ('-i', '--i', '-ins', '--ins'):
 127         args = args[1:]
 128         flags = IGNORECASE
 129         continue
 130     if args[0] in ('-l', '--l', '-links', '--links'):
 131         args = args[1:]
 132         find_links = True
 133         continue
 134     if args[0] == '--':
 135         args = args[1:]
 136         break
 137     break
 138 
 139 try:
 140     regexes = [compile(s, flags=flags) for s in args]
 141     if find_links:
 142         links = 'https?://[A-Za-z0-9+_.:%-]+(/[A-Za-z0-9+_.%/,#?&=-]*)*'
 143         regexes.append(compile(links, flags=IGNORECASE))
 144 
 145     for line in stdin:
 146         # ignore trailing carriage-returns and/or line-feeds in input lines
 147         line = line.rstrip('\r\n').rstrip('\n')
 148         style_line(stdout, line, regexes, ansi_re)
 149         if live:
 150             stdout.flush()
 151 except KeyboardInterrupt:
 152     exit(2)
 153 except Exception as e:
 154     print(str(e), file=stderr)
 155     exit(1)