/*
The MIT License (MIT)

Copyright © 2026 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
To compile a smaller-sized command-line app, you can use the `go` command as
follows:

	go build -ldflags "-s -w" -trimpath shame512.go
*/

package main

import (
	"bufio"
	"crypto/sha512"
	"encoding/hex"
	"errors"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"runtime"
	"sort"
	"sync"
)

// info is the help message: its leading newline is skipped when shown
const info = `
shame512 [options...] [files/folders...]

Group files by common/same SHA-512 hashes: having the same hash means a high
chance of having the same contents, even though that's not a guarantee.

Files and folder names can be given together for convenience: folder names
are searched recursively to find all files, including in any subfolders.

The only option available is to show this help message, using any of
"-h", "--h", "-help", or "--help", without the quotes.
`

// main finds all files from the paths given, checksums them concurrently,
// and shows each checksum followed by the paths of all files sharing it;
// groups are separated by empty lines
func main() {
	args := os.Args[1:]

	if len(args) > 0 {
		switch args[0] {
		case `-h`, `--h`, `-help`, `--help`:
			os.Stdout.WriteString(info[1:])
			return

		case `--`:
			args = args[1:]
		}
	}

	// when no files/folders are named, search the current folder: this must
	// happen before looking for files, since a folder can't be checksummed
	if len(args) == 0 {
		args = []string{`.`}
	}

	// paths has all filepaths given, ignoring repetitions
	paths, ok := findAllFiles(deduplicate(args))
	if !ok {
		os.Exit(1)
	}

	checksums := make([]string, len(paths))
	findChecksums(paths, checksums)

	keys, groups := groupByChecksum(paths, checksums)

	// write errors aren't checked: output is best-effort
	bw := bufio.NewWriter(os.Stdout)
	defer bw.Flush()

	for i, chsum := range keys {
		if i > 0 {
			bw.WriteByte('\n')
		}

		bw.WriteString(chsum)
		bw.WriteByte('\n')

		for _, path := range groups[chsum] {
			bw.WriteString(path)
			bw.WriteByte('\n')
		}
	}
}

// groupByChecksum groups filepaths by common checksum: paths and checksums
// are parallel slices, and items with empty checksums (failed files) are
// ignored; the keys returned are the distinct checksums, sorted by the
// position of the first filepath in each group, so groups keep the order
// in which their first file was found
func groupByChecksum(paths []string, checksums []string) (keys []string, groups map[string][]string) {
	// first remembers the index of the first item for each group
	first := make(map[string]int)
	groups = make(map[string][]string)

	for i, chsum := range checksums {
		if chsum == `` {
			continue
		}

		g, ok := groups[chsum]
		if !ok {
			first[chsum] = i
		}
		groups[chsum] = append(g, paths[i])
	}

	// start with length 0, since keys are appended: a non-zero length
	// would leave empty-string keys, which would show up as empty groups
	keys = make([]string, 0, len(groups))
	for chsum := range groups {
		keys = append(keys, chsum)
	}

	sort.SliceStable(keys, func(i, j int) bool {
		return first[keys[i]] < first[keys[j]]
	})
	return keys, groups
}

// findAllFiles can be given a mix of file/folder paths, finding all files
// recursively in folders, avoiding duplicates; the boolean result is false
// when any path can't be found/searched, each such error being shown on
// standard error exactly once
func findAllFiles(paths []string) (found []string, ok bool) {
	res := make(chan any)
	var all sync.WaitGroup
	all.Add(1)

	// set before the goroutine starts, which is the only one updating it
	// until the wait-group is done
	ok = true

	go func() {
		defer all.Done()
		got := make(map[string]struct{})

		for v := range res {
			// a type-switch avoids redeclaring `ok`, which would shadow the
			// named result and hide failures from the caller
			switch v := v.(type) {
			case error:
				showError(``, v)
				ok = false

			case string:
				if _, seen := got[v]; seen {
					continue
				}
				got[v] = struct{}{}
				found = append(found, v)

			default:
				showError(``, errors.New(`value is neither string nor error`))
				ok = false
			}
		}
	}()

	rec := func(path string, info fs.DirEntry, err error) error {
		if err != nil {
			// returning the error stops the walk and makes WalkDir return
			// it, so it's reported only once, by the loop below
			return err
		}

		if info.IsDir() {
			return nil
		}

		res <- path
		return nil
	}

	for _, s := range paths {
		// a dash means standard input
		// NOTE(review): sha opens every path with os.Open, so a dash is
		// later read as a file named `-`, not as standard input; confirm
		if s == `-` {
			res <- s
			continue
		}

		info, err := os.Stat(s)
		if os.IsNotExist(err) {
			// on windows, file-not-found messages may mention `CreateFile`,
			// even when trying to open files in read-only mode
			res <- errors.New(`can't find file/folder named ` + s)
			continue
		}

		if err != nil {
			res <- err
			continue
		}

		if !info.IsDir() {
			res <- s
			continue
		}

		if err := filepath.WalkDir(s, rec); err != nil {
			res <- err
		}
	}

	close(res)
	all.Wait()

	return found, ok
}

// showErrorMutex keeps concurrent error messages from mixing together
var showErrorMutex sync.Mutex

// showError is safe to call concurrently: it writes a line to standard
// error with the message from the error given, preceded by the path and
// a colon when the path isn't empty
func showError(path string, err error) {
	showErrorMutex.Lock()
	defer showErrorMutex.Unlock()

	if path != `` {
		os.Stderr.WriteString(path)
		os.Stderr.WriteString(`: `)
	}
	os.Stderr.WriteString(err.Error())
	os.Stderr.WriteString("\n")
}

// deduplicate returns the distinct strings from the values given, keeping
// the order in which each first appears
func deduplicate(values []string) []string {
	got := make(map[string]struct{}, len(values))
	unique := make([]string, 0, len(values))

	for _, s := range values {
		if _, ok := got[s]; ok {
			continue
		}
		got[s] = struct{}{}
		unique = append(unique, s)
	}

	return unique
}

// asyncArgs has what all concurrent checksum tasks share
type asyncArgs struct {
	// Permissions limits how many worker tasks can be active at the same
	// time: when given many filepaths to work on, rate-limiting avoids
	// a massive number of concurrent tasks which read and process input
	Permissions chan struct{}

	// Tasks is to wait for all asynchronous tasks to end
	Tasks *sync.WaitGroup
}

// findChecksums concurrently checksums all files from the paths given,
// putting each result in the slot of sums with the same index as its path,
// which means sums must be at least as long as paths; files which can't be
// checksummed get an empty string, after their error is shown; at most as
// many files as there are CPUs are handled at the same time
func findChecksums(paths []string, sums []string) {
	var tasks sync.WaitGroup
	// the number of tasks is always known in advance
	tasks.Add(len(paths))

	args := asyncArgs{
		Permissions: make(chan struct{}, runtime.NumCPU()),
		Tasks:       &tasks,
	}

	// all tasks are done with the channel by the time this func returns,
	// since each task gives back its permission before being marked done
	defer close(args.Permissions)

	for i, path := range paths {
		// wait until some concurrency-room is available
		args.Permissions <- struct{}{}
		go checksum(i, path, sums, args)
	}

	// wait for all tasks to finish
	args.Tasks.Wait()
}

// checksum is the task run concurrently for each file: it puts the
// checksum of a file's bytes in sums[i], or an empty string on failure,
// after showing the error; it then gives back its concurrency-permission,
// and marks its task as done
func checksum(i int, path string, sums []string, args asyncArgs) {
	chsum, err := sha(path)
	if err != nil {
		chsum = ``
		showError(path, err)
	}

	sums[i] = chsum
	<-args.Permissions
	args.Tasks.Done()
}

// sha calculates a checksum for a file's contents: the result is the
// SHA-512 hash of all its bytes, as 128 lowercase hexadecimal digits; an
// empty string comes along with any error from opening/reading the file
func sha(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return ``, err
	}
	defer f.Close()

	h := sha512.New()
	if _, err := io.Copy(h, f); err != nil {
		return ``, err
	}

	// buf has room to fit a SHA-512 hash exactly
	var buf [sha512.Size]byte
	return hex.EncodeToString(h.Sum(buf[:0])), nil
}