// File: dedup.go

/*
The MIT License (MIT)

Copyright © 2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
To compile a smaller-sized command-line app, you can use the `go` command as
follows:

go build -ldflags "-s -w" -trimpath dedup.go
*/

package main

import (
	"bufio"
	"errors"
	"io"
	"os"
)

// Note: the code is avoiding using the fmt package to save hundreds of
// kilobytes on the resulting executable, which is a noticeable difference.

const info = `
dedup [options...] [file...]


DEDUPlicate lines prevents the same line from appearing again in the output,
after the first time. Unique lines are remembered across inputs.

Input is assumed to be UTF-8, and all CRLF byte-pairs are turned into line
feeds by default.

All (optional) leading options start with either single or double-dash:

    -cr      keep trailing carriage-returns from input lines

    -h       show this help message
    -help    show this help message
`

// errNoMoreOutput is a dummy error whose message is ignored, and which
// causes the app to quit immediately and successfully
var errNoMoreOutput = errors.New(`no more output`)

// main handles the (optional) leading option, an optional `--` separator,
// and then deduplicates lines from all the inputs named by the remaining
// arguments, exiting with status 1 on any actual error.
func main() {
	cr := false
	args := os.Args[1:]

	if len(args) > 0 {
		switch args[0] {
		case `-h`, `--h`, `-help`, `--help`:
			// skip the leading line-feed of the help message
			os.Stderr.WriteString(info[1:])
			return

		case `-cr`, `--cr`:
			cr = true
			args = args[1:]
		}
	}

	if len(args) > 0 && args[0] == `--` {
		args = args[1:]
	}

	var cfg config
	cfg.Seen = make(map[string]struct{})
	cfg.KeepCR = cr
	cfg.LiveLines = true
	// when stdout can seek, lines are not flushed one at a time
	if _, err := os.Stdout.Seek(0, io.SeekCurrent); err == nil {
		cfg.LiveLines = false
	}

	if err := run(os.Stdout, args, cfg); isActualError(err) {
		os.Stderr.WriteString(err.Error())
		os.Stderr.WriteString("\n")
		os.Exit(1)
	}
}

// config has all the settings and state shared across inputs: Seen is the
// set of lines already emitted, KeepCR makes input lines keep any trailing
// carriage-return, and LiveLines makes the output flush after each line.
type config struct {
	Seen      map[string]struct{}
	KeepCR    bool
	LiveLines bool
}

// run deduplicates lines from all the named inputs, in order, writing the
// results to w through a buffered writer which is flushed on return. Names
// given more than once are only handled the first time. With no names,
// standard input is used. The first error met is returned right away.
func run(w io.Writer, args []string, cfg config) error {
	// f, _ := os.Create(`dedup.prof`)
	// defer f.Close()
	// pprof.StartCPUProfile(f)
	// defer pprof.StopCPUProfile()

	files := make(map[string]struct{})
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	for _, name := range args {
		if _, ok := files[name]; ok {
			continue
		}
		files[name] = struct{}{}

		if err := handleFile(bw, name, cfg); err != nil {
			return err
		}
	}

	if len(args) == 0 {
		return dedup(bw, os.Stdin, cfg)
	}
	return nil
}

// handleFile deduplicates lines from the file given by name: both the empty
// name and a single dash stand for standard input. Failing to open a file
// results in an error mentioning its name.
func handleFile(w *bufio.Writer, name string, cfg config) error {
	if name == `` || name == `-` {
		return dedup(w, os.Stdin, cfg)
	}

	f, err := os.Open(name)
	if err != nil {
		return errors.New(`can't read from file named "` + name + `"`)
	}
	defer f.Close()

	return dedup(w, f, cfg)
}

// isActualError is to figure out whether not to ignore an error, and thus
// show it as an error message
func isActualError(err error) bool {
	return err != nil && !errors.Is(err, errNoMoreOutput)
}

// dedup reads lines from r and writes to w each line not already in
// cfg.Seen, adding it to that set, so lines are remembered across calls
// sharing the same config. Every emitted line ends with a line-feed. Any
// output failure results in errNoMoreOutput, while input failures come
// back as the scanner's own error.
func dedup(w *bufio.Writer, r io.Reader, cfg config) error {
	const gb = 1024 * 1024 * 1024
	sc := bufio.NewScanner(r)
	sc.Buffer(nil, 8*gb)
	if cfg.KeepCR {
		sc.Split(splitKeepCR)
	}

	seen := cfg.Seen
	live := cfg.LiveLines

	for sc.Scan() {
		line := sc.Text()
		if _, ok := seen[line]; ok {
			continue
		}
		seen[line] = struct{}{}

		if _, err := w.Write(sc.Bytes()); err != nil {
			return errNoMoreOutput
		}
		if err := w.WriteByte('\n'); err != nil {
			return errNoMoreOutput
		}

		if !live {
			continue
		}

		if err := w.Flush(); err != nil {
			return errNoMoreOutput
		}
	}

	return sc.Err()
}

// splitKeepCR is a bufio.SplitFunc which splits input on line-feeds, while
// keeping any carriage-return right before them as part of each line. At
// the end of the input, any non-empty leftover without a line-feed counts
// as a final line.
func splitKeepCR(data []byte, eof bool) (move int, token []byte, err error) {
	if _, next := indexLineEnd(data); next > 0 {
		// next-1 is the index of the line-feed: everything before it is
		// the line, including a trailing carriage-return, if present;
		// advancing only past this line keeps later buffered lines, even
		// when the reader has already reported the end of the input
		return next, data[:next-1], nil
	}

	if eof && len(data) > 0 {
		return len(data), data, nil
	}

	// either ask for more data, or there's nothing left to emit
	return 0, nil, nil
}

// indexLineEnd finds the first line-feed in data: the first result is the
// index where the line's content ends, excluding a carriage-return right
// before the line-feed, while the second is the index right after the
// line-feed. Both results are -1 when data has no line-feed.
func indexLineEnd(data []byte) (int, int) {
	var prev byte

	for i, b := range data {
		if b == '\n' {
			if prev == '\r' {
				return i - 1, i + 1
			}
			return i, i + 1
		}
		prev = b
	}

	return -1, -1
}