File: shame512.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright (c) 2026 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the "Software"), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath shame512.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "crypto/sha512"
  37     "encoding/hex"
  38     "errors"
  39     "io"
  40     "io/fs"
  41     "os"
  42     "path/filepath"
  43     "runtime"
  44     "sort"
  45     "sync"
  46 )
  47 
// info is the help message shown when any of the help options is given: it
// starts with a line-feed, which main skips when showing it
const info = `
shame512 [options...] [files/folders...]

Group files by common/same SHA-512 hashes: having the same hash means a high
chance of having the same contents, even though that's not a guarantee.

Files and folder names can be given together for convenience: folder names
are searched recursively to find all files, including in any subfolders.

The only option available is to show this help message, using any of
"-h", "--h", "-help", or "--help", without the quotes.
`
  60 
  61 func main() {
  62     args := os.Args[1:]
  63 
  64     if len(args) > 0 {
  65         switch args[0] {
  66         case `-h`, `--h`, `-help`, `--help`:
  67             os.Stdout.WriteString(info[1:])
  68             return
  69 
  70         case `--`:
  71             args = args[1:]
  72         }
  73     }
  74 
  75     // paths has all filepaths given, ignoring repetitions
  76     paths, ok := findAllFiles(args)
  77     if !ok {
  78         os.Exit(1)
  79     }
  80     if len(paths) == 0 {
  81         paths = []string{`.`}
  82     }
  83 
  84     checksums := make([]string, len(paths))
  85     findChecksums(paths, checksums)
  86 
  87     // group filepaths by common checksum, remembering the index of the
  88     // first item for each group, so the groups can be sorted by the
  89     // original order their first filepath came from the command-line
  90     indices := make(map[string]int)
  91     groups := make(map[string][]string)
  92     for i, chsum := range checksums {
  93         if chsum == `` {
  94             continue
  95         }
  96         g, ok := groups[chsum]
  97         if !ok {
  98             indices[chsum] = i
  99         }
 100         groups[chsum] = append(g, paths[i])
 101     }
 102 
 103     // keys has the sorted checksums
 104     keys := make([]string, len(groups))
 105     for chsum := range groups {
 106         keys = append(keys, chsum)
 107     }
 108     sort.SliceStable(keys, func(i, j int) bool {
 109         x := indices[keys[i]]
 110         y := indices[keys[j]]
 111         return x < y
 112     })
 113 
 114     shown := 0
 115     bw := bufio.NewWriter(os.Stdout)
 116     defer bw.Flush()
 117 
 118     for _, chsum := range keys {
 119         if shown > 0 {
 120             bw.WriteByte('\n')
 121         }
 122 
 123         bw.WriteString(chsum)
 124         bw.WriteByte('\n')
 125 
 126         files := groups[chsum]
 127         for _, path := range files {
 128             bw.WriteString(path)
 129             bw.WriteByte('\n')
 130         }
 131 
 132         shown++
 133     }
 134 }
 135 
 136 // findAllFiles can be given a mix of file/folder paths, finding all files
 137 // recursively in folders, avoiding duplicates
 138 func findAllFiles(paths []string) (files []string, success bool) {
 139     walk := filepath.WalkDir
 140     got := make(map[string]struct{})
 141     success = true
 142 
 143     for _, path := range paths {
 144         if _, ok := got[path]; ok {
 145             continue
 146         }
 147         got[path] = struct{}{}
 148 
 149         // a dash means standard input
 150         if path == `-` {
 151             files = append(files, path)
 152             continue
 153         }
 154 
 155         info, err := os.Stat(path)
 156         if os.IsNotExist(err) {
 157             // on windows, file-not-found messages may mention `CreateFile`,
 158             // even when trying to open files in read-only mode
 159             err = errors.New(`can't find file/folder named ` + path)
 160         }
 161 
 162         if err != nil {
 163             showError(path, err)
 164             success = false
 165             continue
 166         }
 167 
 168         if !info.IsDir() {
 169             files = append(files, path)
 170             continue
 171         }
 172 
 173         err = walk(path, func(path string, info fs.DirEntry, err error) error {
 174             path, err = filepath.Abs(path)
 175             if err != nil {
 176                 showError(path, err)
 177                 success = false
 178                 return err
 179             }
 180 
 181             if _, ok := got[path]; ok {
 182                 if info.IsDir() {
 183                     return fs.SkipDir
 184                 }
 185                 return nil
 186             }
 187             got[path] = struct{}{}
 188 
 189             if info.IsDir() {
 190                 return nil
 191             }
 192 
 193             files = append(files, path)
 194             return nil
 195         })
 196 
 197         if err != nil {
 198             showError(path, err)
 199             success = false
 200         }
 201     }
 202 
 203     return files, success
 204 }
 205 
 206 func showError(path string, err error) {
 207     if path != `` {
 208         os.Stderr.WriteString(path)
 209         os.Stderr.WriteString(`: `)
 210     }
 211     os.Stderr.WriteString(err.Error())
 212     os.Stderr.WriteString("\n")
 213 }
 214 
 215 func findChecksums(paths []string, sums []string) {
 216     var tasks sync.WaitGroup
 217     // the number of tasks is always known in advance
 218     tasks.Add(len(paths))
 219 
 220     // permissions is buffered to limit concurrency to the core-count
 221     permissions := make(chan struct{}, runtime.NumCPU())
 222     defer close(permissions)
 223 
 224     for i, path := range paths {
 225         // wait until some concurrency-room is available, before proceeding
 226         permissions <- struct{}{}
 227 
 228         go func(i int, path string) {
 229             defer tasks.Done()
 230 
 231             chsum, err := sha(path)
 232             if err != nil {
 233                 chsum = ``
 234                 showError(path, err)
 235             }
 236 
 237             sums[i] = chsum
 238             <-permissions
 239         }(i, path)
 240     }
 241 
 242     // wait for all tasks to finish
 243     tasks.Wait()
 244 }
 245 
 246 // sha calculates a checksum for a file's contents
 247 func sha(path string) (string, error) {
 248     f, err := os.Open(path)
 249     if err != nil {
 250         return ``, err
 251     }
 252     defer f.Close()
 253 
 254     sha := sha512.New()
 255     _, err = io.Copy(sha, f)
 256     if err != nil {
 257         return ``, err
 258     }
 259 
 260     // buf has room to fit a SHA-512 hash exactly: while its hexadecimal-ASCII
 261     // rendition is 128 bytes, the checksum itself is 64 bytes
 262     var buf [64]byte
 263     return hex.EncodeToString(sha.Sum(buf[:0])), err
 264 }