File: hima.py
   1 #!/usr/bin/python
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2026 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
from io import SEEK_CUR
from re import compile, Match, Pattern, IGNORECASE
from sys import argv, exit, maxsize, stderr, stdin, stdout
from typing import List, Optional
  30 
  31 
  32 info = '''
  33 hima [options...] [regexes...]
  34 
  35 
  36 HIlight MAtches ANSI-styles matching regular expressions along lines read
  37 from the standard input. The regular-expression mode used is a superset of
  38 the commonly-used "extended-mode".
  39 
  40 Regexes always avoid matching any ANSI-style sequences, to avoid messing
  41 those up. Also, multiple matches in a line never overlap: at each step
  42 along a line, the earliest-starting match among the regexes always wins,
  43 as the order regexes are given among the arguments never matters.
  44 
  45 The options are, available both in single and double-dash versions
  46 
  47     -h, -help     show this help message
  48     -i, -ins      match regexes case-insensitively
  49     -l, -links    add a case-insensitive regex to match HTTP/HTTPS links
  50 '''
  51 
  52 if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
  53     print(info.strip(), file=stderr)
  54     exit(0)
  55 
# ansi_re matches ANSI-style sequences: an escape character followed by `[`,
# then any run of digits/semicolons, then a single ending letter; lines are
# split using it, so the regexes given are only ever tried on the text
# `around` these sequences, and never on the sequences themselves
ansi_re = compile('\x1b\\[[0-9;]*[A-Za-z]')
  58 
  59 
  60 def match(src: str, start: int, stop: int, regexes: List[Pattern]) -> Match:
  61     first = None
  62     for expr in regexes:
  63         m = expr.search(src, start, stop)
  64         if not m or m.start() == m.end():
  65             continue
  66         if not first or m.start() < first.start():
  67             first = m
  68     return first
  69 
  70 
  71 def style_line(w, s: str, regexes: List[Pattern], ansi_re: Pattern) -> None:
  72     # start is used outside the regex-match loop to handle trailing parts
  73     # in lines
  74     start = 0
  75 
  76     # replace all regex-matches on the line by surrounding each matched
  77     # substring with ANSI styles/resets
  78     while True:
  79         m = ansi_re.search(s, start)
  80         if not m:
  81             start = style_chunk(w, s, start, maxsize, regexes)
  82             break
  83 
  84         stop = m.start()
  85         start = style_chunk(w, s, start, stop, regexes)
  86         # don't forget the last part of the line, or the whole line
  87         stop = m.end()
  88         w.write(s[start:stop])
  89         start = stop
  90 
  91     # don't forget the last part of the line, or the whole line
  92     w.write(s[start:])
  93     w.write('\n')
  94 
  95 
  96 def style_chunk(w, s: str, start: int, stop: int, pats: List[Pattern]) -> int:
  97     while True:
  98         m = match(s, start, stop, pats)
  99         if not m:
 100             return start
 101 
 102         i = m.start()
 103         j = m.end()
 104 
 105         # part before match
 106         w.write(s[start:i])
 107 
 108         # current match
 109         w.write('\x1b[7m')
 110         w.write(s[i:j])
 111         w.write('\x1b[0m')
 112 
 113         # the end of the match is the start of the `rest` of the string
 114         start = j
 115 
 116 
 117 try:
 118     stdout.seek(0, SEEK_CUR)
 119     live = False
 120 except:
 121     live = True
 122 
 123 flags = 0
 124 args = argv[1:]
 125 find_links = False
 126 
 127 while len(args):
 128     if args[0] in ('-i', '--i', '-ins', '--ins'):
 129         args = args[1:]
 130         flags = IGNORECASE
 131         continue
 132     if args[0] in ('-l', '--l', '-links', '--links'):
 133         args = args[1:]
 134         find_links = True
 135         continue
 136     if args[0] == '--':
 137         args = args[1:]
 138         break
 139     break
 140 
 141 try:
 142     regexes = [compile(s, flags=flags) for s in args]
 143     if find_links:
 144         links = 'https?://[A-Za-z0-9+_.:%-]+(/[A-Za-z0-9+_.%/,#?&=-]*)*'
 145         regexes.append(compile(links, flags=IGNORECASE))
 146 
 147     for line in stdin:
 148         line = line.rstrip('\r\n').rstrip('\n')
 149         style_line(stdout, line, regexes, ansi_re)
 150         if live:
 151             stdout.flush()
 152 except KeyboardInterrupt:
 153     exit(2)
 154 except Exception as e:
 155     print(str(e), file=stderr)
 156     exit(1)