File: shame512.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2026 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath shame512.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "crypto/sha512"
  37     "encoding/hex"
  38     "errors"
  39     "io"
  40     "io/fs"
  41     "os"
  42     "path/filepath"
  43     "runtime"
  44     "sort"
  45     "sync"
  46 )
  47 
// info is the help message shown by the help options: it starts with an
// empty line, which main skips when showing the message
const info = `
shame512 [options...] [files/folders...]

Group files by common/same SHA-512 hashes: having the same hash means a high
chance of having the same contents, even though that's not a guarantee.

Files and folder names can be given together for convenience: folder names
are searched recursively to find all files, including in any subfolders.

The only option available is to show this help message, using any of
"-h", "--h", "-help", or "--help", without the quotes.
`
  60 
  61 func main() {
  62     args := os.Args[1:]
  63 
  64     if len(args) > 0 {
  65         switch args[0] {
  66         case `-h`, `--h`, `-help`, `--help`:
  67             os.Stdout.WriteString(info[1:])
  68             return
  69 
  70         case `--`:
  71             args = args[1:]
  72         }
  73     }
  74 
  75     // paths has all filepaths given, ignoring repetitions
  76     paths, ok := findAllFiles(deduplicate(args))
  77     if !ok {
  78         os.Exit(1)
  79     }
  80     if len(paths) == 0 {
  81         paths = []string{`.`}
  82     }
  83 
  84     checksums := make([]string, len(paths))
  85     findChecksums(paths, checksums)
  86 
  87     // group filepaths by common checksum, remembering the index of the
  88     // first item for each group, so the groups can be sorted by the
  89     // original order their first filepath came from the command-line
  90     indices := make(map[string]int)
  91     groups := make(map[string][]string)
  92     for i, chsum := range checksums {
  93         if chsum == `` {
  94             continue
  95         }
  96         g, ok := groups[chsum]
  97         if !ok {
  98             indices[chsum] = i
  99         }
 100         groups[chsum] = append(g, paths[i])
 101     }
 102 
 103     // keys has the sorted checksums
 104     keys := make([]string, len(groups))
 105     for chsum := range groups {
 106         keys = append(keys, chsum)
 107     }
 108     sort.SliceStable(keys, func(i, j int) bool {
 109         x := indices[keys[i]]
 110         y := indices[keys[j]]
 111         return x < y
 112     })
 113 
 114     shown := 0
 115     bw := bufio.NewWriter(os.Stdout)
 116     defer bw.Flush()
 117 
 118     for _, chsum := range keys {
 119         if shown > 0 {
 120             bw.WriteByte('\n')
 121         }
 122 
 123         bw.WriteString(chsum)
 124         bw.WriteByte('\n')
 125 
 126         files := groups[chsum]
 127         for _, path := range files {
 128             bw.WriteString(path)
 129             bw.WriteByte('\n')
 130         }
 131 
 132         shown++
 133     }
 134 }
 135 
 136 // findAllFiles can be given a mix of file/folder paths, finding all files
 137 // recursively in folders, avoiding duplicates
 138 func findAllFiles(paths []string) (found []string, ok bool) {
 139     res := make(chan any)
 140     var all sync.WaitGroup
 141     all.Add(1)
 142 
 143     go func() {
 144         defer all.Done()
 145         got := make(map[string]struct{})
 146         ok = true
 147 
 148         for v := range res {
 149             if err, ok := v.(error); ok {
 150                 showError(``, err)
 151                 ok = false
 152                 continue
 153             }
 154 
 155             s, ok := v.(string)
 156             if !ok {
 157                 showError(``, errors.New(`value is neither string nor error`))
 158                 ok = false
 159                 continue
 160             }
 161 
 162             if _, ok := got[s]; ok {
 163                 continue
 164             }
 165 
 166             got[s] = struct{}{}
 167             found = append(found, s)
 168         }
 169     }()
 170 
 171     rec := func(path string, info fs.DirEntry, err error) error {
 172         if err != nil {
 173             res <- err
 174             return err
 175         }
 176 
 177         if info.IsDir() {
 178             return nil
 179         }
 180 
 181         res <- path
 182         return nil
 183     }
 184 
 185     for _, s := range paths {
 186         // a dash means standard input
 187         if s == `-` {
 188             res <- s
 189             continue
 190         }
 191 
 192         info, err := os.Stat(s)
 193         if os.IsNotExist(err) {
 194             // on windows, file-not-found messages may mention `CreateFile`,
 195             // even when trying to open files in read-only mode
 196             res <- errors.New(`can't find file/folder named ` + s)
 197             continue
 198         }
 199 
 200         if err != nil {
 201             res <- err
 202             continue
 203         }
 204 
 205         if !info.IsDir() {
 206             res <- s
 207             continue
 208         }
 209 
 210         if err := filepath.WalkDir(s, rec); err != nil {
 211             res <- err
 212         }
 213     }
 214 
 215     close(res)
 216     all.Wait()
 217 
 218     return found, ok
 219 }
 220 
 221 var showErrorMutex sync.Mutex
 222 
 223 // showError is safe to call concurrently
 224 func showError(path string, err error) {
 225     showErrorMutex.Lock()
 226     defer showErrorMutex.Unlock()
 227 
 228     if path != `` {
 229         os.Stderr.WriteString(path)
 230         os.Stderr.WriteString(`: `)
 231     }
 232     os.Stderr.WriteString(err.Error())
 233     os.Stderr.WriteString("\n")
 234 }
 235 
 236 func deduplicate(values []string) []string {
 237     got := make(map[string]struct{})
 238     unique := make([]string, 0, len(values))
 239 
 240     for _, s := range values {
 241         if _, ok := got[s]; ok {
 242             continue
 243         }
 244         got[s] = struct{}{}
 245         unique = append(unique, s)
 246     }
 247 
 248     return unique
 249 }
 250 
// asyncArgs has what concurrent checksum tasks share with findChecksums,
// which is what starts them and waits for them
type asyncArgs struct {
    // Permissions limits how many worker tasks can be active at the same
    // time: when given many filepaths to work on, rate-limiting avoids
    // a massive number of concurrent tasks which read and process input;
    // findChecksums sends an item before starting each task, and each
    // task receives an item back when it's done
    Permissions chan struct{}

    // Tasks is to wait for all asynchronous tasks to end: each task calls
    // its Done method as its last step
    Tasks *sync.WaitGroup
}
 260 
 261 func findChecksums(paths []string, sums []string) {
 262     var tasks sync.WaitGroup
 263     // the number of tasks is always known in advance
 264     tasks.Add(len(paths))
 265 
 266     args := asyncArgs{
 267         Permissions: make(chan struct{}, runtime.NumCPU()),
 268         Tasks:       &tasks,
 269     }
 270 
 271     defer close(args.Permissions)
 272 
 273     for i, path := range paths {
 274         // wait until some concurrency-room is available
 275         args.Permissions <- struct{}{}
 276         go checksum(i, path, sums, args)
 277     }
 278 
 279     // wait for all tasks to finish
 280     args.Tasks.Wait()
 281 }
 282 
 283 // checksum reports the checksum of a file's bytes
 284 func checksum(i int, path string, sums []string, args asyncArgs) {
 285     chsum, err := sha(path)
 286     if err != nil {
 287         chsum = ``
 288         showError(path, err)
 289     }
 290 
 291     sums[i] = chsum
 292     <-args.Permissions
 293     args.Tasks.Done()
 294 }
 295 
 296 // sha calculates a checksum for a file's contents
 297 func sha(path string) (string, error) {
 298     f, err := os.Open(path)
 299     if err != nil {
 300         return ``, err
 301     }
 302     defer f.Close()
 303 
 304     sha := sha512.New()
 305     _, err = io.Copy(sha, f)
 306     if err != nil {
 307         return ``, err
 308     }
 309 
 310     // buf has room to fit a SHA-512 hash exactly
 311     var buf [64]byte
 312     return hex.EncodeToString(sha.Sum(buf[:0])), err
 313 }