File: hima.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright (c) 2026 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the "Software"), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath hima.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "bytes"
  37     "io"
  38     "os"
  39     "regexp"
  40     "strings"
  41 )
  42 
  43 const info = `
  44 hima [options...] [regexes...]
  45 
  46 
  47 HIlight MAtches ANSI-styles matching regular expressions along lines read
  48 from the standard input. The regular-expression mode used is "re2", which
  49 is a superset of the commonly-used "extended-mode".
  50 
  51 Regexes always avoid matching any ANSI-style sequences, to avoid messing
  52 those up. Also, multiple matches in a line never overlap: at each step
  53 along a line, the earliest-starting match among the regexes always wins,
  54 as the order regexes are given among the arguments never matters.
  55 
  56 The options are, available both in single and double-dash versions
  57 
  58     -h, -help      show this help message
  59     -f, -filter    filter out (ignore) lines with no matches
  60     -i, -ins       match regexes case-insensitively
  61 `
  62 
// highlightStyle is the ANSI escape sequence emitted right before each match;
// SGR code 7 selects reverse-video, which swaps foreground and background
const highlightStyle = "\x1b[7m"
  64 
  65 func main() {
  66     filter := false
  67     buffered := false
  68     insensitive := false
  69     args := os.Args[1:]
  70 
  71     for len(args) > 0 {
  72         switch args[0] {
  73         case `-b`, `--b`, `-buffered`, `--buffered`:
  74             buffered = true
  75             args = args[1:]
  76 
  77         case `-f`, `--f`, `-filter`, `--filter`:
  78             filter = true
  79             args = args[1:]
  80 
  81         case `-h`, `--h`, `-help`, `--help`:
  82             os.Stdout.WriteString(info[1:])
  83             return
  84 
  85         case `-i`, `--i`, `-ins`, `--ins`:
  86             insensitive = true
  87             args = args[1:]
  88         }
  89 
  90         break
  91     }
  92 
  93     if len(args) > 0 && args[0] == `--` {
  94         args = args[1:]
  95     }
  96 
  97     patterns := make([]pattern, 0, len(args))
  98 
  99     for _, s := range args {
 100         var err error
 101         var pat pattern
 102 
 103         if insensitive {
 104             pat, err = compile(`(?i)` + s)
 105         } else {
 106             pat, err = compile(s)
 107         }
 108 
 109         if err != nil {
 110             os.Stderr.WriteString(err.Error())
 111             os.Stderr.WriteString("\n")
 112             continue
 113         }
 114 
 115         patterns = append(patterns, pat)
 116     }
 117 
 118     // quit right away when given invalid regexes
 119     if len(patterns) < len(args) {
 120         os.Exit(1)
 121     }
 122 
 123     liveLines := !buffered
 124     if !buffered {
 125         if _, err := os.Stdout.Seek(0, io.SeekCurrent); err == nil {
 126             liveLines = false
 127         }
 128     }
 129 
 130     err := run(os.Stdout, os.Stdin, patterns, filter, liveLines)
 131     if err != nil && err != io.EOF {
 132         os.Stderr.WriteString(err.Error())
 133         os.Stderr.WriteString("\n")
 134         os.Exit(1)
 135     }
 136 }
 137 
 138 // pattern is a regular-expression pattern which distinguishes between the
 139 // start/end of a line and those of the chunks it can be used to match
 140 type pattern struct {
 141     // expr is the regular-expression
 142     expr *regexp.Regexp
 143 
 144     // begin is whether the regexp refers to the start of a line
 145     begin bool
 146 
 147     // end is whether the regexp refers to the end of a line
 148     end bool
 149 }
 150 
 151 func compile(src string) (pattern, error) {
 152     expr, err := regexp.Compile(src)
 153 
 154     var pat pattern
 155     pat.expr = expr
 156     pat.begin = strings.HasPrefix(src, `^`) || strings.HasPrefix(src, `(?i)^`)
 157     pat.end = strings.HasSuffix(src, `$`) && !strings.HasSuffix(src, `\$`)
 158     return pat, err
 159 }
 160 
 161 func (p pattern) findIndex(s []byte, i int, last int) (start int, stop int) {
 162     if i > 0 && p.begin {
 163         return -1, -1
 164     }
 165     if i != last && p.end {
 166         return -1, -1
 167     }
 168 
 169     span := p.expr.FindIndex(s)
 170     // also ignore empty regex matches to avoid infinite outer loops,
 171     // as skipping empty slices isn't advancing at all, leaving the
 172     // string stuck to being empty-matched forever by the same regex
 173     if len(span) != 2 || span[0] == span[1] {
 174         return -1, -1
 175     }
 176 
 177     return span[0], span[1]
 178 }
 179 
 180 func run(w io.Writer, r io.Reader, pats []pattern, filter, live bool) error {
 181     sc := bufio.NewScanner(r)
 182     sc.Buffer(nil, 8*1024*1024*1024)
 183     bw := bufio.NewWriter(w)
 184     defer bw.Flush()
 185 
 186     for i := 0; sc.Scan(); i++ {
 187         s := sc.Bytes()
 188         if i == 0 && bytes.HasPrefix(s, []byte{0xef, 0xbb, 0xbf}) {
 189             s = s[3:]
 190         }
 191 
 192         n := 0
 193         last := countChunks(s) - 1
 194         if last < 0 {
 195             last = 0
 196         }
 197 
 198         if filter && !matches(s, pats, last) {
 199             continue
 200         }
 201 
 202         for len(s) > 0 {
 203             i, j := indexEscapeSequence(s)
 204             if i < 0 {
 205                 handleChunk(bw, s, pats, n, last)
 206                 break
 207             }
 208             if j < 0 {
 209                 j = len(s)
 210             }
 211 
 212             handleChunk(bw, s[:i], pats, n, last)
 213             if i > 0 {
 214                 n++
 215             }
 216 
 217             bw.Write(s[i:j])
 218 
 219             s = s[j:]
 220         }
 221 
 222         if bw.WriteByte('\n') != nil {
 223             return io.EOF
 224         }
 225 
 226         if !live {
 227             continue
 228         }
 229 
 230         if bw.Flush() != nil {
 231             return io.EOF
 232         }
 233     }
 234 
 235     return sc.Err()
 236 }
 237 
 238 // matches finds out if any regex matches any substring around ANSI-sequences
 239 func matches(s []byte, patterns []pattern, last int) bool {
 240     n := 0
 241 
 242     for len(s) > 0 {
 243         i, j := indexEscapeSequence(s)
 244         if i < 0 {
 245             for _, p := range patterns {
 246                 if begin, _ := p.findIndex(s, n, last); begin >= 0 {
 247                     return true
 248                 }
 249             }
 250             return false
 251         }
 252 
 253         if j < 0 {
 254             j = len(s)
 255         }
 256 
 257         for _, p := range patterns {
 258             if begin, _ := p.findIndex(s[:i], n, last); begin >= 0 {
 259                 return true
 260             }
 261         }
 262 
 263         if i > 0 {
 264             n++
 265         }
 266 
 267         s = s[j:]
 268     }
 269 
 270     return false
 271 }
 272 
 273 func countChunks(s []byte) int {
 274     chunks := 0
 275 
 276     for len(s) > 0 {
 277         i, j := indexEscapeSequence(s)
 278         if i < 0 {
 279             break
 280         }
 281 
 282         if i > 0 {
 283             chunks++
 284         }
 285 
 286         if j < 0 {
 287             break
 288         }
 289         s = s[j:]
 290     }
 291 
 292     if len(s) > 0 {
 293         chunks++
 294     }
 295     return chunks
 296 }
 297 
 298 // indexEscapeSequence finds the first ANSI-style escape-sequence, which is
 299 // the multi-byte sequences starting with ESC[; the result is a pair of slice
 300 // indices which can be independently negative when either the start/end of
 301 // a sequence isn't found; given their fairly-common use, even the hyperlink
 302 // ESC]8 sequences are supported
 303 func indexEscapeSequence(s []byte) (int, int) {
 304     var prev byte
 305 
 306     for i, b := range s {
 307         if prev == '\x1b' && b == '[' {
 308             j := indexLetter(s[i+1:])
 309             if j < 0 {
 310                 return i, -1
 311             }
 312             return i - 1, i + 1 + j + 1
 313         }
 314 
 315         if prev == '\x1b' && b == ']' && i+1 < len(s) && s[i+1] == '8' {
 316             j := indexPair(s[i+1:], '\x1b', '\\')
 317             if j < 0 {
 318                 return i, -1
 319             }
 320             return i - 1, i + 1 + j + 2
 321         }
 322 
 323         prev = b
 324     }
 325 
 326     return -1, -1
 327 }
 328 
 329 func indexLetter(s []byte) int {
 330     for i, b := range s {
 331         upper := b &^ 32
 332         if 'A' <= upper && upper <= 'Z' {
 333             return i
 334         }
 335     }
 336 
 337     return -1
 338 }
 339 
 340 func indexPair(s []byte, x byte, y byte) int {
 341     var prev byte
 342 
 343     for i, b := range s {
 344         if prev == x && b == y && i > 0 {
 345             return i
 346         }
 347         prev = b
 348     }
 349 
 350     return -1
 351 }
 352 
// note: restoring previously-active ANSI-styles after each style-reset, as a
// previous version used to do, doesn't seem to be worth it, given its results
 355 
 356 // handleChunk handles line-slices around any detected ANSI-style sequences,
 357 // or even whole lines, when no ANSI-styles are found in them
 358 func handleChunk(w *bufio.Writer, s []byte, with []pattern, n int, last int) {
 359     for len(s) > 0 {
 360         start, end := -1, -1
 361         for _, p := range with {
 362             i, j := p.findIndex(s, n, last)
 363             if i >= 0 && (i < start || start < 0) {
 364                 start, end = i, j
 365             }
 366         }
 367 
 368         if start < 0 {
 369             w.Write(s)
 370             return
 371         }
 372 
 373         w.Write(s[:start])
 374         w.WriteString(highlightStyle)
 375         w.Write(s[start:end])
 376         w.WriteString("\x1b[0m")
 377 
 378         s = s[end:]
 379     }
 380 }