File: dedup.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath dedup.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "errors"
  37     "io"
  38     "os"
  39 )
  40 
// Note: this code avoids the fmt package to save hundreds of kilobytes on
// the resulting executable, which is a noticeable difference.
  43 
// info is the help message shown by the help options; it starts with a line
// feed to keep the source readable, so main skips that first byte when
// showing it
const info = `
dedup [options...] [file...]


DEDUPlicate lines prevents the same line from appearing again in the output,
after the first time. Unique lines are remembered across inputs.

Input is assumed to be UTF-8, and all CRLF byte-pairs are turned into line
feeds by default.

All (optional) leading options start with either single or double-dash:

    -cr         keep trailing carriage-returns from input lines

    -h          show this help message
    -help       show this help message
`
  61 
// errNoMoreOutput is a sentinel error whose message is never shown: func
// dedup returns it when writing to the output fails, and func isActualError
// filters it out, so main ends without an error message or a failure exit
// code
var errNoMoreOutput = errors.New(`no more output`)
  65 
  66 func main() {
  67     cr := false
  68     args := os.Args[1:]
  69 
  70     if len(args) > 0 {
  71         switch args[0] {
  72         case `-h`, `--h`, `-help`, `--help`:
  73             os.Stderr.WriteString(info[1:])
  74             return
  75 
  76         case `-cr`, `--cr`:
  77             cr = true
  78             args = args[1:]
  79         }
  80     }
  81 
  82     if len(args) > 0 && args[0] == `--` {
  83         args = args[1:]
  84     }
  85 
  86     var cfg config
  87     cfg.Seen = make(map[string]struct{})
  88     cfg.KeepCR = cr
  89     cfg.LiveLines = true
  90     if _, err := os.Stdout.Seek(0, io.SeekCurrent); err == nil {
  91         cfg.LiveLines = false
  92     }
  93 
  94     if err := run(os.Stdout, args, cfg); isActualError(err) {
  95         os.Stderr.WriteString(err.Error())
  96         os.Stderr.WriteString("\n")
  97         os.Exit(1)
  98     }
  99 }
 100 
// config has the settings and the state which func dedup uses for each of
// the inputs
type config struct {
    // Seen is the set of lines emitted so far: since maps are references,
    // all copies of a config share the same set
    Seen      map[string]struct{}
    // KeepCR makes func dedup split lines using func splitKeepCR, instead
    // of the scanner's default line-splitter
    KeepCR    bool
    // LiveLines makes func dedup flush the output after each line emitted
    LiveLines bool
}
 106 
 107 func run(w io.Writer, args []string, cfg config) error {
 108     // f, _ := os.Create(`dedup.prof`)
 109     // defer f.Close()
 110     // pprof.StartCPUProfile(f)
 111     // defer pprof.StopCPUProfile()
 112     files := make(map[string]struct{})
 113     bw := bufio.NewWriter(w)
 114     defer bw.Flush()
 115 
 116     for _, name := range args {
 117         if _, ok := files[name]; ok {
 118             continue
 119         }
 120         files[name] = struct{}{}
 121 
 122         if err := handleFile(bw, name, cfg); err != nil {
 123             return err
 124         }
 125     }
 126 
 127     if len(args) == 0 {
 128         return dedup(bw, os.Stdin, cfg)
 129     }
 130     return nil
 131 }
 132 
 133 func handleFile(w *bufio.Writer, name string, cfg config) error {
 134     if name == `` || name == `-` {
 135         return dedup(w, os.Stdin, cfg)
 136     }
 137 
 138     f, err := os.Open(name)
 139     if err != nil {
 140         return errors.New(`can't read from file named "` + name + `"`)
 141     }
 142     defer f.Close()
 143 
 144     return dedup(w, f, cfg)
 145 }
 146 
 147 // isActualError is to figure out whether not to ignore an error, and thus
 148 // show it as an error message
 149 func isActualError(err error) bool {
 150     return err != nil && err != errNoMoreOutput
 151 }
 152 
 153 func dedup(w *bufio.Writer, r io.Reader, cfg config) error {
 154     const gb = 1024 * 1024 * 1024
 155     sc := bufio.NewScanner(r)
 156     sc.Buffer(nil, 8*gb)
 157     if cfg.KeepCR {
 158         sc.Split(splitKeepCR)
 159     }
 160 
 161     seen := cfg.Seen
 162     live := cfg.LiveLines
 163 
 164     for sc.Scan() {
 165         line := sc.Text()
 166         if _, ok := seen[line]; ok {
 167             continue
 168         }
 169         seen[line] = struct{}{}
 170 
 171         w.Write(sc.Bytes())
 172         if w.WriteByte('\n') != nil {
 173             return errNoMoreOutput
 174         }
 175 
 176         if !live {
 177             continue
 178         }
 179 
 180         if err := w.Flush(); err != nil {
 181             return errNoMoreOutput
 182         }
 183     }
 184 
 185     return sc.Err()
 186 }
 187 
 188 func splitKeepCR(data []byte, eof bool) (move int, token []byte, err error) {
 189     i, j := indexLineEnd(data)
 190 
 191     if !eof {
 192         if i < 0 {
 193             return 0, nil, nil
 194         }
 195         return j, data[:i], nil
 196     }
 197 
 198     if len(data) == 0 {
 199         return 0, nil, nil
 200     }
 201 
 202     if i < 0 {
 203         return len(data), data, nil
 204     }
 205     return len(data), data[:i], nil
 206 }
 207 
 208 func indexLineEnd(data []byte) (int, int) {
 209     var prev byte
 210 
 211     for i, b := range data {
 212         if b == '\n' {
 213             if prev == '\r' {
 214                 return i - 1, i + 1
 215             }
 216             return i, i + 1
 217         }
 218         prev = b
 219     }
 220 
 221     return -1, -1
 222 }