File: dedup.go

package info (click to toggle)
golang-github-containers-storage 1.59.1%2Bds1-2
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 4,184 kB
  • sloc: sh: 630; ansic: 389; makefile: 143; awk: 12
file content (163 lines) | stat: -rw-r--r-- 3,835 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
package dedup

import (
	"crypto/sha256"
	"encoding/binary"
	"errors"
	"fmt"
	"hash/crc64"
	"io/fs"
	"sync"

	"github.com/opencontainers/selinux/pkg/pwalkdir"
	"github.com/sirupsen/logrus"
)

// errNotSupported signals that reflinks cannot be used on this platform.
// DedupDirs treats it as a reason to stop walking and return the result
// gathered so far without reporting an error to its caller.
var errNotSupported = errors.New("reflinks are not supported on this platform")

// Hash methods that can be selected through DedupOptions.HashMethod.
const (
	// DedupHashInvalid is the zero value; getFileChecksum rejects it with an error.
	DedupHashInvalid DedupHashMethod = iota
	// DedupHashCRC keys files by a CRC-64 checksum (ECMA polynomial).
	DedupHashCRC
	// DedupHashFileSize keys files by their size only; no file data is hashed.
	DedupHashFileSize
	// DedupHashSHA256 keys files by a SHA-256 digest.
	DedupHashSHA256
)

// DedupHashMethod selects how files are fingerprinted when looking for
// candidates that may hold identical data.
type DedupHashMethod int

// DedupOptions holds the settings accepted by DedupDirs.
type DedupOptions struct {
	// HashMethod is the hash function to use to find identical files
	HashMethod DedupHashMethod
}

// DedupResult is the value returned by DedupDirs.
type DedupResult struct {
	// Deduped represents the total number of bytes saved by deduplication.
	// This value accounts also for all previously deduplicated data, not only the savings
	// from the last run.
	Deduped uint64
}

// getFileChecksum returns the key under which the file at path is grouped with
// other files that may hold the same data.
//
// For DedupHashFileSize the key is the decimal file size taken from info.  For
// DedupHashSHA256 and DedupHashCRC the key is the raw (binary, not hex) digest
// computed by a callback handed to readAllFile; readAllFile is defined
// elsewhere and presumably feeds the file's contents to that callback.
// DedupHashInvalid and any unrecognized method yield an error.
func getFileChecksum(hashMethod DedupHashMethod, path string, info fs.FileInfo) (string, error) {
	// SHA-256 digest of the data, as a 32-byte string.
	sha256Key := func(data []byte) (string, error) {
		digest := sha256.Sum256(data)
		return string(digest[:]), nil
	}
	// CRC-64/ECMA of the data, as an 8-byte big-endian string.
	crc64Key := func(data []byte) (string, error) {
		var encoded [8]byte
		sum := crc64.Checksum(data, crc64.MakeTable(crc64.ECMA))
		binary.BigEndian.PutUint64(encoded[:], sum)
		return string(encoded[:]), nil
	}

	switch hashMethod {
	case DedupHashFileSize:
		return fmt.Sprint(info.Size()), nil
	case DedupHashSHA256:
		return readAllFile(path, info, sha256Key)
	case DedupHashCRC:
		return readAllFile(path, info, crc64Key)
	case DedupHashInvalid:
		return "", fmt.Errorf("invalid hash method: %v", hashMethod)
	}
	return "", fmt.Errorf("unknown hash method: %v", hashMethod)
}

// pathsLocked is the per-hash entry stored in the hashToPaths map of DedupDirs.
type pathsLocked struct {
	// paths lists the files seen so far with this hash that serve as
	// deduplication sources for later files with the same hash.
	paths []string
	// lock is held by DedupDirs while it iterates over and appends to paths.
	lock sync.Mutex
}

// DedupDirs walks every directory in dirs and tries to deduplicate regular
// files that share the same checksum, computed with options.HashMethod.
//
// Non-regular files, empty files, and files the platform helper reports as
// already visited are skipped.  The first file seen for a checksum is only
// recorded.  Every later file with that checksum is deduplicated against the
// recorded candidates, in order, until one attempt saves some bytes; if none
// does, the file itself is recorded as a further candidate.
//
// The returned DedupResult.Deduped is the sum of the byte counts reported by
// the successful dedup attempts.  Errors from walking, reading file info, the
// visited check, or hashing abort the walk and are returned together with the
// result gathered so far.  Errors from an individual dedup attempt are only
// logged at debug level, except errNotSupported, which stops the walk and
// makes DedupDirs return the current result with a nil error.
func DedupDirs(dirs []string, options DedupOptions) (DedupResult, error) {
	res := DedupResult{}
	hashToPaths := make(map[string]*pathsLocked)
	lock := sync.Mutex{} // protects `hashToPaths` and `res`

	dedup, err := newDedupFiles()
	if err != nil {
		return res, err
	}

	for _, dir := range dirs {
		logrus.Debugf("Deduping directory %s", dir)
		// NOTE(review): the locking below implies the callback may run
		// concurrently for different paths — confirm against pwalkdir.Walk.
		if err := pwalkdir.Walk(dir, func(path string, d fs.DirEntry, err error) error {
			if err != nil {
				return err
			}
			if !d.Type().IsRegular() {
				return nil
			}
			info, err := d.Info()
			if err != nil {
				return err
			}
			if info.Size() == 0 {
				// do not bother with empty files
				return nil
			}

			// the file was already deduplicated
			// NOTE(review): despite the helper's name, a true result is
			// treated here as "already visited" — confirm against the helper.
			if visited, err := dedup.isFirstVisitOf(info); err != nil {
				return err
			} else if visited {
				return nil
			}

			h, err := getFileChecksum(options.HashMethod, path, info)
			if err != nil {
				return err
			}

			lock.Lock()
			item, foundItem := hashToPaths[h]
			if !foundItem {
				// first file with this checksum: remember it as a source
				item = &pathsLocked{paths: []string{path}}
				hashToPaths[h] = item
				lock.Unlock()
				return nil
			}
			// take the per-hash lock before releasing the global one, so the
			// candidate list cannot change between the lookup and its use
			item.lock.Lock()
			lock.Unlock()

			dedupBytes, err := func() (uint64, error) { // function to have a scope for the defer statement
				defer item.lock.Unlock()

				var dedupBytes uint64
				for _, src := range item.paths {
					deduped, err := dedup.dedup(src, path, info)
					if err != nil {
						logrus.Debugf("Failed to deduplicate: %v", err)
						if errors.Is(err, errNotSupported) {
							return dedupBytes, err
						}
						continue
					}
					if deduped > 0 {
						logrus.Debugf("Deduped %q -> %q (%d bytes)", src, path, deduped)
						dedupBytes += deduped
						break
					}
					// No error and nothing saved: not a failure, so it is
					// not logged as one; just try the next candidate.
				}
				if dedupBytes == 0 {
					// nothing matched: keep this file as another candidate
					item.paths = append(item.paths, path)
				}
				return dedupBytes, nil
			}()
			if err != nil {
				return err
			}

			lock.Lock()
			res.Deduped += dedupBytes
			lock.Unlock()
			return nil
		}); err != nil {
			// if reflinks are not supported, return immediately without errors
			if errors.Is(err, errNotSupported) {
				return res, nil
			}
			return res, err
		}
	}
	return res, nil
}