File: rewriter.go

package info (click to toggle)
git-lfs 3.6.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,808 kB
  • sloc: sh: 21,256; makefile: 507; ruby: 417
file content (629 lines) | stat: -rw-r--r-- 19,000 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
package githistory

import (
	"encoding/hex"
	"fmt"
	"os"
	"strings"
	"sync"

	"github.com/git-lfs/git-lfs/v3/errors"
	"github.com/git-lfs/git-lfs/v3/filepathfilter"
	"github.com/git-lfs/git-lfs/v3/git"
	"github.com/git-lfs/git-lfs/v3/tasklog"
	"github.com/git-lfs/git-lfs/v3/tr"
	"github.com/git-lfs/gitobj/v2"
)

// Rewriter allows rewriting topologically equivalent Git histories
// between two revisions.
//
// A *Rewriter is constructed with NewRewriter() and driven by Rewrite(). It
// remembers which tree entries and commits it has already rewritten so that
// later commits can reuse those results.
type Rewriter struct {
	// mu guards entries and commits (see below).
	mu *sync.Mutex
	// entries is a mapping of old tree entries to new (rewritten) ones.
	// Since TreeEntry contains a []byte (and is therefore not a key-able
	// type), a unique (path, TreeEntry) -> string function (see entryKey)
	// is used for map keys.
	entries map[string]*gitobj.TreeEntry
	// commits is a mapping of old commit object IDs to new ones, where the
	// ASCII hex encoding of the old object ID is used as the map key.
	commits map[string][]byte
	// filter is an optional value used to specify which tree entries
	// (blobs, subtrees) are modifiable given a BlobFn. If non-nil, this
	// filter will cull out any unmodifiable subtrees and blobs.
	filter *filepathfilter.Filter
	// db is the *ObjectDatabase from which blobs, commits, and trees are
	// loaded, and to which rewritten objects are written.
	db *gitobj.ObjectDatabase
	// l is the *tasklog.Logger to which progress updates are written.
	l *tasklog.Logger
}

// RewriteOptions is an options type given to the Rewrite() function.
type RewriteOptions struct {
	// Include is the list of refs of which commits reachable by that ref
	// will be included.
	Include []string
	// Exclude is the list of refs of which commits reachable by that ref
	// will be excluded.
	Exclude []string

	// UpdateRefs specifies whether the Rewriter should move refs from the
	// original graph onto the migrated one. If true, the refs will be
	// moved, and a reflog entry will be created.
	UpdateRefs bool

	// Verbose mode prints migrated objects: when set, Rewrite() reports
	// each rewritten blob to the progress task.
	Verbose bool

	// ObjectMapFilePath is the path of a file to which the mapping of old
	// commit IDs to new commit IDs is written. When non-empty, Rewrite()
	// creates the file (failing if it already exists) and writes one
	// "<old hex>,<new hex>" line for every commit that was written anew.
	ObjectMapFilePath string

	// BlobFn specifies a function to rewrite blobs.
	//
	// It is called once per unique, unchanged path. That is to say, if
	// /a/foo and /a/bar contain identical contents, the BlobFn will be
	// called twice: once for /a/foo and once for /a/bar, but no more on
	// each blob for subsequent revisions, so long as each entry remains
	// unchanged.
	//
	// If nil, blobs are left as they are.
	BlobFn BlobRewriteFn
	// TreePreCallbackFn specifies a function to be called before opening a
	// tree for rewriting. It will be called on all trees throughout history
	// in topological ordering through the tree, starting at the root.
	//
	// If nil, no pre-callback is performed.
	TreePreCallbackFn TreePreCallbackFn
	// TreeCallbackFn specifies a function to rewrite trees after they have
	// been reassembled by calling the above BlobFn on all existing tree
	// entries.
	//
	// If nil, the reassembled tree is used unchanged.
	TreeCallbackFn TreeCallbackFn
}

// blobFn picks the BlobRewriteFn to use for a rewrite: the one supplied in the
// *RewriteOptions when it is set, and noopBlobFn otherwise.
func (r *RewriteOptions) blobFn() BlobRewriteFn {
	if fn := r.BlobFn; fn != nil {
		return fn
	}
	return noopBlobFn
}

// treePreFn picks the TreePreCallbackFn to use for a rewrite: the one supplied
// in the *RewriteOptions when it is set, and noopTreePreFn otherwise.
func (r *RewriteOptions) treePreFn() TreePreCallbackFn {
	if fn := r.TreePreCallbackFn; fn != nil {
		return fn
	}
	return noopTreePreFn
}

// treeFn picks the TreeCallbackFn to use for a rewrite: the one supplied in
// the *RewriteOptions when it is set, and noopTreeFn otherwise.
func (r *RewriteOptions) treeFn() TreeCallbackFn {
	if fn := r.TreeCallbackFn; fn != nil {
		return fn
	}
	return noopTreeFn
}

// BlobRewriteFn is a mapping function that takes a given blob and returns a
// new, modified blob. If it returns an error, the new blob will not be written
// and instead the error will be returned from the Rewrite() function.
//
// Invocations of an instance of BlobRewriteFn are not expected to store the
// returned blobs in the *git/gitobj.ObjectDatabase; the Rewriter writes the
// returned blob itself when it differs from the one given.
//
// The path argument identifies the tree entry being rewritten relative to the
// repository root. For instance, a file "b.txt" in directory "dir" would be
// given as "/dir/b.txt", where as a file "a.txt" in the root would be given as
// "/a.txt".
//
// NOTE(review): rewriteTree in this file builds the path by joining entry
// names with "/" and passes it to the BlobRewriteFn without a leading "/"
// (e.g. "dir/b.txt"), regardless of the OS path separator. Confirm which form
// implementations should rely on.
type BlobRewriteFn func(path string, b *gitobj.Blob) (*gitobj.Blob, error)

// TreePreCallbackFn specifies a function to call upon opening a new tree for
// rewriting.
//
// The path argument is "/" for the root tree, and "/" followed by the
// slash-separated path of the subtree otherwise (e.g. "/dir/sub").
//
// Unlike its sibling TreeCallbackFn, TreePreCallbackFn may not modify the given
// tree.
//
// TreePreCallbackFn can be nil, and will therefore exhibit behavior equivalent
// to only calling the BlobFn on existing tree entries.
//
// If the TreePreCallbackFn returns an error, it will be returned from the
// Rewrite() invocation.
type TreePreCallbackFn func(path string, t *gitobj.Tree) error

// TreeCallbackFn specifies a function to call before writing a re-written tree
// to the object database. The TreeCallbackFn can return a modified tree to be
// written to the object database instead of one generated from calling BlobFn
// on all of the tree entries.
//
// The path argument is "/" for the root tree, and "/" followed by the
// slash-separated path of the subtree otherwise (e.g. "/dir/sub").
//
// TreeCallbackFn can be nil, and will therefore exhibit behavior equivalent to
// only calling the BlobFn on existing tree entries.
//
// If the TreeCallbackFn returns an error, it will be returned from the
// Rewrite() invocation.
type TreeCallbackFn func(path string, t *gitobj.Tree) (*gitobj.Tree, error)

// rewriterOption is a functional option applied to a *Rewriter by
// NewRewriter(). See WithFilter and WithLogger.
type rewriterOption func(*Rewriter)

var (
	// WithFilter is an optional argument given to the NewRewriter
	// constructor function to limit invocations of the BlobRewriteFn to
	// only paths that are allowed by the given *filepathfilter.Filter.
	WithFilter = func(filter *filepathfilter.Filter) rewriterOption {
		return func(r *Rewriter) {
			r.filter = filter
		}
	}

	// WithLogger is an optional argument given to the NewRewriter
	// constructor function that causes progress updates from the
	// *git/githistory.Rewriter to be written to the provided logger, "l".
	WithLogger = func(l *tasklog.Logger) rewriterOption {
		return func(r *Rewriter) {
			r.l = l
		}
	}

	// noopBlobFn is a no-op implementation of the BlobRewriteFn. It returns
	// the blob that it was given, and returns no error.
	noopBlobFn = func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil }
	// noopTreePreFn is a no-op implementation of the TreePreCallbackFn. It
	// ignores the tree that it was given, and returns no error.
	noopTreePreFn = func(path string, t *gitobj.Tree) error { return nil }
	// noopTreeFn is a no-op implementation of the TreeCallbackFn. It returns
	// the tree that it was given, and returns no error.
	noopTreeFn = func(path string, t *gitobj.Tree) (*gitobj.Tree, error) { return t, nil }
)

// NewRewriter builds a *Rewriter that reads from and writes to the given
// *ObjectDatabase. The entry and commit caches start out empty, and each of
// the given options is applied, in order, before the *Rewriter is returned.
func NewRewriter(db *gitobj.ObjectDatabase, opts ...rewriterOption) *Rewriter {
	r := &Rewriter{
		db: db,

		mu:      &sync.Mutex{},
		entries: map[string]*gitobj.TreeEntry{},
		commits: map[string][]byte{},
	}

	for i := range opts {
		opts[i](r)
	}
	return r
}

// Rewrite rewrites the range of commits selected by
// *RewriteOptions.{Include,Exclude}, running the configured blob and tree
// callbacks over each commit's tree.
//
// Commits are processed parents-first. Each one is re-created with its
// rewritten tree and rewritten parents; a commit that comes out equal to the
// original keeps its ID and is not written again. If ObjectMapFilePath is set,
// an "old,new" line is recorded for every commit that was written. If
// UpdateRefs is set, the non-remote refs are then moved onto the new history.
//
// It returns the ID of the last commit processed (nil if there were no
// commits), or the first error encountered.
func (r *Rewriter) Rewrite(opt *RewriteOptions) ([]byte, error) {
	// Work out which commits are in scope before doing anything else.
	commits, err := r.commitsToMigrate(opt)
	if err != nil {
		return nil, err
	}

	// The progress label depends on whether refs will actually be moved.
	var label string
	if opt.UpdateRefs {
		label = tr.Tr.Get("Rewriting commits")
	} else {
		label = tr.Tr.Get("Examining commits")
	}
	perc := r.l.Percentage(fmt.Sprintf("migrate: %s", label), uint64(len(commits)))
	defer perc.Complete()

	// Per-blob reporting only happens in verbose mode; a nil task turns it
	// off further down the call chain.
	var verbose *tasklog.PercentageTask
	if opt.Verbose {
		verbose = perc
	}

	// The object map file must not exist yet (O_EXCL).
	var objectMap *os.File
	if opt.ObjectMapFilePath != "" {
		objectMap, err = os.OpenFile(opt.ObjectMapFilePath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666)
		if err != nil {
			return nil, errors.New(tr.Tr.Get("could not create object map file: %v", err))
		}
		defer objectMap.Close()
	}

	// tip tracks the most recently processed commit, which callers
	// typically feed to git-update-ref(1).
	var tip []byte
	for _, oid := range commits {
		// The original commit supplies the metadata, tree, and parents.
		original, err := r.db.Commit(oid)
		if err != nil {
			return nil, err
		}

		newTree, err := r.rewriteTree(oid, original.TreeID, "", opt.blobFn(), opt.treePreFn(), opt.treeFn(), verbose)
		if err != nil {
			return nil, err
		}

		// Re-point each parent at its rewritten counterpart. Parents are
		// always visited before their children, so a parent missing from
		// the cache lies outside the migrated range; it is kept as-is so
		// that history stays connected across the migration boundary.
		newParents := make([][]byte, 0, len(original.ParentIDs))
		for _, parent := range original.ParentIDs {
			if mapped, ok := r.uncacheCommit(parent); ok {
				newParents = append(newParents, mapped)
			} else {
				newParents = append(newParents, parent)
			}
		}

		// Same header data as the original, new tree and parents.
		candidate := &gitobj.Commit{
			Author:       original.Author,
			Committer:    original.Committer,
			ExtraHeaders: original.ExtraHeaders,
			Message:      original.Message,

			ParentIDs: newParents,
			TreeID:    newTree,
		}

		var newSha []byte
		if !original.Equal(candidate) {
			newSha, err = r.db.WriteCommit(candidate)
			if err != nil {
				return nil, err
			}
			if objectMap != nil {
				if _, err := fmt.Fprintf(objectMap, "%x,%x\n", oid, newSha); err != nil {
					return nil, err
				}
			}
		} else {
			// Nothing changed: reuse the original ID (as a private copy).
			newSha = append(make([]byte, 0, len(oid)), oid...)
		}

		// Remember the mapping so children of this commit can be
		// re-parented, then advance the progress meter and the tip.
		r.cacheCommit(oid, newSha)
		perc.Count(1)
		tip = newSha
	}

	if !opt.UpdateRefs {
		return tip, nil
	}

	refs, err := r.refsToMigrate()
	if err != nil {
		return nil, errors.Wrap(err, tr.Tr.Get("could not find refs to update"))
	}

	root, _ := r.db.Root()

	updater := &refUpdater{
		CacheFn: r.uncacheCommit,
		Logger:  r.l,
		Refs:    refs,
		Root:    root,

		db: r.db,
	}
	if err := updater.UpdateRefs(); err != nil {
		return nil, errors.Wrap(err, tr.Tr.Get("could not update refs"))
	}

	return tip, nil
}

// rewriteTree recursively rewrites the tree with ID "treeOID" located at
// "path" (empty for the root tree, slash-separated otherwise) within the
// commit "commitOID".
//
// The TreePreCallbackFn "tpfn" is invoked first with the loaded tree. Each
// entry is then handled in order:
//
//   - blobs rejected by the filter, and symlinks, are copied unchanged;
//   - entries already seen at the same path with the same ID reuse the cached
//     result (keeping the current entry's file mode);
//   - blobs are passed through "fn", subtrees recurse, and anything else keeps
//     its ID.
//
// Finally the TreeCallbackFn "tfn" is given the reassembled tree. If the
// result equals the original tree, the original ID is returned; otherwise the
// result is written to the object database and its new ID is returned. Any
// error from loading, the callbacks, or writing is returned as-is.
func (r *Rewriter) rewriteTree(commitOID []byte, treeOID []byte, path string,
	fn BlobRewriteFn, tpfn TreePreCallbackFn, tfn TreeCallbackFn,
	perc *tasklog.PercentageTask) ([]byte, error) {

	tree, err := r.db.Tree(treeOID)
	if err != nil {
		return nil, err
	}

	// Tree callbacks see the path with a leading slash.
	treePath := "/" + path
	if err := tpfn(treePath, tree); err != nil {
		return nil, err
	}

	rebuilt := make([]*gitobj.TreeEntry, 0, len(tree.Entries))
	for _, entry := range tree.Entries {
		fullpath := entry.Name
		if path != "" {
			fullpath = path + "/" + entry.Name
		}

		// Filtered-out entries and symlinks (mode 0120000) pass through
		// untouched.
		if !r.allows(entry.Type(), fullpath) || entry.Filemode == 0120000 {
			rebuilt = append(rebuilt, copyEntry(entry))
			continue
		}

		if hit := r.uncacheEntry(fullpath, entry); hit != nil {
			rebuilt = append(rebuilt, copyEntryMode(hit, entry.Filemode))
			continue
		}

		// Entries that are neither blobs nor trees keep their ID.
		oid := entry.Oid
		switch entry.Type() {
		case gitobj.BlobObjectType:
			oid, err = r.rewriteBlob(commitOID, entry.Oid, fullpath, fn, perc)
		case gitobj.TreeObjectType:
			oid, err = r.rewriteTree(commitOID, entry.Oid, fullpath, fn, tpfn, tfn, perc)
		}
		if err != nil {
			return nil, err
		}

		replacement := &gitobj.TreeEntry{
			Filemode: entry.Filemode,
			Name:     entry.Name,
			Oid:      oid,
		}
		rebuilt = append(rebuilt, r.cacheEntry(fullpath, entry, replacement))
	}

	rewritten, err := tfn(treePath, &gitobj.Tree{Entries: rebuilt})
	if err != nil {
		return nil, err
	}

	// Only write a new object when the tree actually changed.
	if !tree.Equal(rewritten) {
		return r.db.WriteTree(rewritten)
	}
	return treeOID, nil
}

// copyEntry returns a copy of the given *TreeEntry whose Oid is backed by its
// own storage, so that the copy and the original do not share bytes. A nil
// entry yields nil.
func copyEntry(e *gitobj.TreeEntry) *gitobj.TreeEntry {
	if e == nil {
		return nil
	}

	dup := &gitobj.TreeEntry{
		Filemode: e.Filemode,
		Name:     e.Name,
		Oid:      make([]byte, len(e.Oid)),
	}
	copy(dup.Oid, e.Oid)

	return dup
}

// copyEntryMode returns a copy of the given *TreeEntry (see copyEntry) with
// its Filemode replaced by "mode". The entry must not be nil: copyEntry
// returns nil in that case and setting the mode would panic.
func copyEntryMode(e *gitobj.TreeEntry, mode int32) *gitobj.TreeEntry {
	dup := copyEntry(e)
	dup.Filemode = mode
	return dup
}

// allows reports whether the entry of type "typ" at path "abs" may be
// rewritten. Blobs are checked against the Rewriter's filter (with any leading
// "/" removed from the path); trees and commits are always allowed. Any other
// object type is a programming error and causes a panic.
func (r *Rewriter) allows(typ gitobj.ObjectType, abs string) bool {
	if typ == gitobj.BlobObjectType {
		return r.Filter().Allows(strings.TrimPrefix(abs, "/"))
	}
	if typ == gitobj.CommitObjectType || typ == gitobj.TreeObjectType {
		return true
	}
	panic(fmt.Sprintf("git/githistory: %s", tr.Tr.Get("unknown entry type: %s", typ)))
}

// rewriteBlob loads the blob with ID "from" out of the object database and
// hands it, together with "path", to the BlobRewriteFn "fn".
//
// If "fn" returns a blob equal to the one it was given, nothing is written and
// "from" is returned. Otherwise the new blob is written and its ID returned;
// when "perc" is non-nil the rewrite is also reported as an entry on it,
// naming the commit "commitOID" and the path.
//
// An error is returned if the blob could not be loaded, "fn" failed, the new
// blob could not be written, or the source blob could not be closed. Note that
// the source blob is not closed when "fn" or the write returns an error.
func (r *Rewriter) rewriteBlob(commitOID, from []byte, path string, fn BlobRewriteFn, perc *tasklog.PercentageTask) ([]byte, error) {
	source, err := r.db.Blob(from)
	if err != nil {
		return nil, err
	}

	rewritten, err := fn(path, source)
	if err != nil {
		return nil, err
	}

	if source.Equal(rewritten) {
		// Unchanged: neither blob was written, so the source is still
		// open and has to be closed here.
		if err := source.Close(); err != nil {
			return nil, err
		}
		return from, nil
	}

	sha, err := r.db.WriteBlob(rewritten)
	if err != nil {
		return nil, err
	}

	// The source differs from the blob that was just written, so it is
	// closed separately. (Had they been the same blob, the write would
	// already have closed it, and closing an *os.File a second time yields
	// `os.ErrInvalid`.)
	if err := source.Close(); err != nil {
		return nil, err
	}

	if perc != nil {
		perc.Entry(fmt.Sprintf("migrate: %s", tr.Tr.Get("commit %s: %s", hex.EncodeToString(commitOID), path)))
	}
	return sha, nil
}

// commitsToMigrate collects, in memory, the IDs of all commits selected by the
// Include and Exclude refs of the given *RewriteOptions, in the order produced
// by the rev-list scanner configured through scannerOpts(). Each element is
// the raw object ID as reported by the scanner.
//
// A "Sorting commits" waiter is shown on the logger while this runs. Any error
// from creating, running, or closing the scanner is returned.
func (r *Rewriter) commitsToMigrate(opt *RewriteOptions) ([][]byte, error) {
	task := r.l.Waiter(fmt.Sprintf("migrate: %s", tr.Tr.Get("Sorting commits")))
	defer task.Complete()

	revs, err := git.NewRevListScanner(opt.Include, opt.Exclude, r.scannerOpts())
	if err != nil {
		return nil, err
	}

	var oids [][]byte
	for revs.Scan() {
		oids = append(oids, revs.OID())
	}

	if scanErr := revs.Err(); scanErr != nil {
		return nil, scanErr
	}
	if closeErr := revs.Close(); closeErr != nil {
		return nil, closeErr
	}
	return oids, nil
}

// refsToMigrate lists the references that should be moved onto the rewritten
// history: every ref known to the repository except remote branches. When the
// object database has a root, refs are read from that root; otherwise they are
// read from the current repository. An error is returned if the refs could not
// be loaded.
func (r *Rewriter) refsToMigrate() ([]*git.Ref, error) {
	var (
		all []*git.Ref
		err error
	)

	root, rooted := r.db.Root()
	if rooted {
		all, err = git.AllRefsIn(root)
	} else {
		all, err = git.AllRefs()
	}
	if err != nil {
		return nil, err
	}

	var kept []*git.Ref
	for _, ref := range all {
		if ref.Type != git.RefTypeRemoteBranch {
			kept = append(kept, ref)
		}
	}
	return kept, nil
}

// scannerOpts builds the *git.ScanRefsOptions handed to the
// *git.RevListScanner: commits only, in reversed topological order.
//
// When the object database reports a root, that root is used as the scanner's
// working directory; otherwise the working directory is left unset.
func (r *Rewriter) scannerOpts() *git.ScanRefsOptions {
	opts := &git.ScanRefsOptions{
		Mode:        git.ScanRefsMode,
		Order:       git.TopoRevListOrder,
		Reverse:     true,
		CommitsOnly: true,

		SkippedRefs: []string{},
		Mutex:       &sync.Mutex{},
		Names:       map[string]string{},
	}

	root, ok := r.db.Root()
	if !ok {
		return opts
	}

	opts.WorkingDir = root
	return opts
}

// Filter returns the *filepathfilter.Filter that this *Rewriter consults when
// deciding which blobs may be rewritten (see WithFilter), or nil if no filter
// was configured.
func (r *Rewriter) Filter() *filepathfilter.Filter {
	return r.filter
}

// cacheEntry records that the entry "from", found at "path", is rewritten as
// the *TreeEntry "to", so that later lookups through uncacheEntry() yield
// "to". It returns "to" for convenience.
func (r *Rewriter) cacheEntry(path string, from, to *gitobj.TreeEntry) *gitobj.TreeEntry {
	key := r.entryKey(path, from)

	r.mu.Lock()
	defer r.mu.Unlock()

	r.entries[key] = to
	return to
}

// uncacheEntry looks up the *TreeEntry that the entry "from", found at "path",
// was previously rewritten to (see cacheEntry). It returns nil when no such
// entry has been cached.
func (r *Rewriter) uncacheEntry(path string, from *gitobj.TreeEntry) *gitobj.TreeEntry {
	key := r.entryKey(path, from)

	r.mu.Lock()
	defer r.mu.Unlock()

	return r.entries[key]
}

// entryKey derives the cache key for the *TreeEntry "e" at "path": the path,
// a colon, and the lowercase hex encoding of the entry's object ID.
func (r *Rewriter) entryKey(path string, e *gitobj.TreeEntry) string {
	return path + ":" + hex.EncodeToString(e.Oid)
}

// cacheCommit records that the commit with ID "from" has been rewritten as the
// commit with ID "to", so that later lookups through uncacheCommit() yield
// "to". The hex encoding of "from" is used as the cache key.
func (r *Rewriter) cacheCommit(from, to []byte) {
	key := hex.EncodeToString(from)

	r.mu.Lock()
	defer r.mu.Unlock()

	r.commits[key] = to
}

// uncacheCommit looks up the ID that the commit with ID "from" was rewritten
// to (see cacheCommit). It returns that ID and true when a mapping exists, or
// nil and false when the commit has not been cached.
func (r *Rewriter) uncacheCommit(from []byte) ([]byte, bool) {
	key := hex.EncodeToString(from)

	r.mu.Lock()
	defer r.mu.Unlock()

	to, found := r.commits[key]
	return to, found
}