1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
|
package dedup
import (
"crypto/sha256"
"encoding/binary"
"errors"
"fmt"
"hash/crc64"
"io/fs"
"sync"
"github.com/opencontainers/selinux/pkg/pwalkdir"
"github.com/sirupsen/logrus"
)
var errNotSupported = errors.New("reflinks are not supported on this platform")
const (
DedupHashInvalid DedupHashMethod = iota
DedupHashCRC
DedupHashFileSize
DedupHashSHA256
)
type DedupHashMethod int
type DedupOptions struct {
// HashMethod is the hash function to use to find identical files
HashMethod DedupHashMethod
}
type DedupResult struct {
// Deduped represents the total number of bytes saved by deduplication.
// This value accounts also for all previously deduplicated data, not only the savings
// from the last run.
Deduped uint64
}
func getFileChecksum(hashMethod DedupHashMethod, path string, info fs.FileInfo) (string, error) {
switch hashMethod {
case DedupHashInvalid:
return "", fmt.Errorf("invalid hash method: %v", hashMethod)
case DedupHashFileSize:
return fmt.Sprintf("%v", info.Size()), nil
case DedupHashSHA256:
return readAllFile(path, info, func(buf []byte) (string, error) {
h := sha256.New()
if _, err := h.Write(buf); err != nil {
return "", err
}
return string(h.Sum(nil)), nil
})
case DedupHashCRC:
return readAllFile(path, info, func(buf []byte) (string, error) {
c := crc64.New(crc64.MakeTable(crc64.ECMA))
if _, err := c.Write(buf); err != nil {
return "", err
}
bufRet := make([]byte, 8)
binary.BigEndian.PutUint64(bufRet, c.Sum64())
return string(bufRet), nil
})
default:
return "", fmt.Errorf("unknown hash method: %v", hashMethod)
}
}
type pathsLocked struct {
paths []string
lock sync.Mutex
}
func DedupDirs(dirs []string, options DedupOptions) (DedupResult, error) {
res := DedupResult{}
hashToPaths := make(map[string]*pathsLocked)
lock := sync.Mutex{} // protects `hashToPaths` and `res`
dedup, err := newDedupFiles()
if err != nil {
return res, err
}
for _, dir := range dirs {
logrus.Debugf("Deduping directory %s", dir)
if err := pwalkdir.Walk(dir, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
if !d.Type().IsRegular() {
return nil
}
info, err := d.Info()
if err != nil {
return err
}
size := uint64(info.Size())
if size == 0 {
// do not bother with empty files
return nil
}
// the file was already deduplicated
if visited, err := dedup.isFirstVisitOf(info); err != nil {
return err
} else if visited {
return nil
}
h, err := getFileChecksum(options.HashMethod, path, info)
if err != nil {
return err
}
lock.Lock()
item, foundItem := hashToPaths[h]
if !foundItem {
item = &pathsLocked{paths: []string{path}}
hashToPaths[h] = item
lock.Unlock()
return nil
}
item.lock.Lock()
lock.Unlock()
dedupBytes, err := func() (uint64, error) { // function to have a scope for the defer statement
defer item.lock.Unlock()
var dedupBytes uint64
for _, src := range item.paths {
deduped, err := dedup.dedup(src, path, info)
if err == nil && deduped > 0 {
logrus.Debugf("Deduped %q -> %q (%d bytes)", src, path, deduped)
dedupBytes += deduped
break
}
logrus.Debugf("Failed to deduplicate: %v", err)
if errors.Is(err, errNotSupported) {
return dedupBytes, err
}
}
if dedupBytes == 0 {
item.paths = append(item.paths, path)
}
return dedupBytes, nil
}()
if err != nil {
return err
}
lock.Lock()
res.Deduped += dedupBytes
lock.Unlock()
return nil
}); err != nil {
// if reflinks are not supported, return immediately without errors
if errors.Is(err, errNotSupported) {
return res, nil
}
return res, err
}
}
return res, nil
}
|