1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629
|
package githistory
import (
"encoding/hex"
"fmt"
"os"
"strings"
"sync"
"github.com/git-lfs/git-lfs/v3/errors"
"github.com/git-lfs/git-lfs/v3/filepathfilter"
"github.com/git-lfs/git-lfs/v3/git"
"github.com/git-lfs/git-lfs/v3/tasklog"
"github.com/git-lfs/git-lfs/v3/tr"
"github.com/git-lfs/gitobj/v2"
)
// Rewriter allows rewriting topologically equivalent Git histories
// between two revisions.
//
// A Rewriter remembers the tree entries and commits it has already rewritten
// so that repeated occurrences are resolved from its caches rather than being
// rewritten again.
type Rewriter struct {
// mu guards entries and commits (see below)
mu *sync.Mutex
// entries is a mapping of old tree entries to new (rewritten) ones.
// Since TreeEntry contains a []byte (and is therefore not a key-able
// type), a unique TreeEntry -> string function is used for map keys.
// The key combines the entry's path with the hex encoding of its OID
// (see entryKey).
entries map[string]*gitobj.TreeEntry
// commits is a mapping of old commit SHAs to new ones, where the ASCII
// hex encoding of the old SHA values are used as map keys.
commits map[string][]byte
// filter is an optional value used to specify which blobs are
// modifiable given a BlobFn. Only blob entries are checked against the
// filter; tree and commit entries are always traversed (see allows).
filter *filepathfilter.Filter
// db is the *ObjectDatabase from which blobs, commits, and trees are
// loaded, and to which rewritten objects are written.
db *gitobj.ObjectDatabase
// l is the *tasklog.Logger to which progress updates are written.
l *tasklog.Logger
}
// RewriteOptions is an options type given to the Rewrite() function.
type RewriteOptions struct {
// Include is the list of refs of which commits reachable by that ref
// will be included.
Include []string
// Exclude is the list of refs of which commits reachable by that ref
// will be excluded.
Exclude []string
// UpdateRefs specifies whether the Rewriter should move refs from the
// original graph onto the migrated one. If true, the refs will be
// moved, and a reflog entry will be created.
//
// It also selects the progress message shown while commits are
// processed ("Rewriting commits" when true, "Examining commits"
// otherwise).
UpdateRefs bool
// Verbose mode prints migrated objects: one progress entry is logged
// for each blob that was actually rewritten.
Verbose bool
// ObjectMapFilePath is the path to the map of old sha1 to new sha1
// commits. If non-empty, the file is created exclusively (it must not
// already exist) and receives one "<old>,<new>" line of hex-encoded
// OIDs for every commit whose rewritten form differs from the original.
ObjectMapFilePath string
// BlobFn specifies a function to rewrite blobs.
//
// It is called once per unique, unchanged path. That is to say, if
// /a/foo and /a/bar contain identical contents, the BlobFn will be
// called twice: once for /a/foo and once for /a/bar, but no more on
// each blob for subsequent revisions, so long as each entry remains
// unchanged.
//
// If nil, blobs are left untouched.
BlobFn BlobRewriteFn
// TreePreCallbackFn specifies a function to be called before opening a
// tree for rewriting. It will be called on all trees throughout history
// in topological ordering through the tree, starting at the root.
//
// If nil, no pre-callback is performed.
TreePreCallbackFn TreePreCallbackFn
// TreeCallbackFn specifies a function to rewrite trees after they have
// been reassembled by calling the above BlobFn on all existing tree
// entries.
//
// If nil, reassembled trees are used as-is.
TreeCallbackFn TreeCallbackFn
}
// blobFn resolves the BlobRewriteFn to use for a rewrite: the BlobFn supplied
// on the *RewriteOptions when one was set, and noopBlobFn otherwise.
func (r *RewriteOptions) blobFn() BlobRewriteFn {
	if fn := r.BlobFn; fn != nil {
		return fn
	}
	return noopBlobFn
}
// treePreFn resolves the TreePreCallbackFn to use for a rewrite: the
// TreePreCallbackFn supplied on the *RewriteOptions when one was set, and
// noopTreePreFn otherwise.
func (r *RewriteOptions) treePreFn() TreePreCallbackFn {
	if fn := r.TreePreCallbackFn; fn != nil {
		return fn
	}
	return noopTreePreFn
}
// treeFn resolves the TreeCallbackFn to use for a rewrite: the TreeCallbackFn
// supplied on the *RewriteOptions when one was set, and noopTreeFn otherwise.
func (r *RewriteOptions) treeFn() TreeCallbackFn {
	if fn := r.TreeCallbackFn; fn != nil {
		return fn
	}
	return noopTreeFn
}
// BlobRewriteFn is a mapping function that takes a given blob and returns a
// new, modified blob. If it returns an error, the new blob will not be written
// and instead the error will be returned from the Rewrite() function.
//
// Invocations of an instance of BlobRewriteFn are not expected to store the
// returned blobs in the *git/gitobj.ObjectDatabase. A returned blob that
// compares Equal to the one given is treated as unchanged and is not written.
//
// The path argument is the path of the tree entry being rewritten, relative to
// the repository root and without a leading slash. For instance, a file
// "b.txt" in directory "dir" is given as "dir/b.txt", whereas a file "a.txt"
// in the root is given as "a.txt".
//
// Path components are always joined with "/", regardless of the operating
// system's path separator.
type BlobRewriteFn func(path string, b *gitobj.Blob) (*gitobj.Blob, error)
// TreePreCallbackFn specifies a function to call upon opening a new tree for
// rewriting.
//
// The path argument is the slash-separated path of the tree with a leading
// "/": the root tree is given as "/", and a subtree "dir" as "/dir".
//
// Unlike its sibling TreeCallbackFn, TreePreCallbackFn may not modify the given
// tree.
//
// TreePreCallbackFn can be nil, and will therefore exhibit behavior equivalent
// to only calling the BlobFn on existing tree entries.
//
// If the TreePreCallbackFn returns an error, it will be returned from the
// Rewrite() invocation.
type TreePreCallbackFn func(path string, t *gitobj.Tree) error
// TreeCallbackFn specifies a function to call before writing a re-written tree
// to the object database. The TreeCallbackFn can return a modified tree to be
// written to the object database instead of one generated from calling BlobFn
// on all of the tree entries.
//
// The path argument is the slash-separated path of the tree with a leading
// "/": the root tree is given as "/", and a subtree "dir" as "/dir".
//
// If the returned tree compares Equal to the original tree, nothing is written
// and the original tree's OID is reused.
//
// TreeCallbackFn can be nil, and will therefore exhibit behavior equivalent to
// only calling the BlobFn on existing tree entries.
//
// If the TreeCallbackFn returns an error, it will be returned from the
// Rewrite() invocation.
type TreeCallbackFn func(path string, t *gitobj.Tree) (*gitobj.Tree, error)
// rewriterOption is a functional option that NewRewriter applies to the
// *Rewriter it constructs.
type rewriterOption func(*Rewriter)
var (
	// WithFilter is an optional argument given to the NewRewriter
	// constructor function that restricts invocations of the
	// BlobRewriteFn to paths matching the given *filepathfilter.Filter.
	WithFilter = func(filter *filepathfilter.Filter) rewriterOption {
		return func(rewriter *Rewriter) {
			rewriter.filter = filter
		}
	}

	// WithLogger is an optional argument given to the NewRewriter
	// constructor function that sets the *tasklog.Logger, "l", to which
	// the *Rewriter writes its updates.
	WithLogger = func(l *tasklog.Logger) rewriterOption {
		return func(rewriter *Rewriter) {
			rewriter.l = l
		}
	}

	// noopBlobFn is a no-op implementation of the BlobRewriteFn. It
	// returns the blob that it was given, and returns no error.
	noopBlobFn = func(_ string, b *gitobj.Blob) (*gitobj.Blob, error) {
		return b, nil
	}

	// noopTreePreFn is a no-op implementation of the TreePreCallbackFn.
	// It ignores its arguments and returns no error.
	noopTreePreFn = func(_ string, _ *gitobj.Tree) error {
		return nil
	}

	// noopTreeFn is a no-op implementation of the TreeCallbackFn. It
	// returns the tree that it was given, and returns no error.
	noopTreeFn = func(_ string, t *gitobj.Tree) (*gitobj.Tree, error) {
		return t, nil
	}
)
// NewRewriter builds a *Rewriter backed by the given *ObjectDatabase. The
// rewriter starts with empty entry and commit caches; each of the given
// options is then applied to it, in order.
func NewRewriter(db *gitobj.ObjectDatabase, opts ...rewriterOption) *Rewriter {
	r := &Rewriter{
		db:      db,
		mu:      &sync.Mutex{},
		commits: map[string][]byte{},
		entries: map[string]*gitobj.TreeEntry{},
	}

	for i := range opts {
		opts[i](r)
	}

	return r
}
// Rewrite rewrites the range of commits given by
// *RewriteOptions.{Include,Exclude} using the BlobRewriteFn to rewrite
// the individual blobs.
//
// For each commit, the root tree is rewritten, the parents are re-pointed at
// their rewritten counterparts, and a new commit is written unless the result
// is equal to the original. If opt.ObjectMapFilePath is set, a line
// "<old>,<new>" is appended to that file for each commit that was written.
// If opt.UpdateRefs is set, non-remote refs are updated once all commits have
// been processed.
//
// It returns the (possibly unchanged) OID of the last commit processed, which
// is nil when there were no commits to process, or the first error
// encountered.
func (r *Rewriter) Rewrite(opt *RewriteOptions) ([]byte, error) {
	// First, obtain a list of commits to rewrite.
	commits, err := r.commitsToMigrate(opt)
	if err != nil {
		return nil, err
	}

	var perc *tasklog.PercentageTask
	if opt.UpdateRefs {
		perc = r.l.Percentage(fmt.Sprintf("migrate: %s", tr.Tr.Get("Rewriting commits")), uint64(len(commits)))
	} else {
		perc = r.l.Percentage(fmt.Sprintf("migrate: %s", tr.Tr.Get("Examining commits")), uint64(len(commits)))
	}
	defer perc.Complete()

	// Only hand the progress task down to the blob rewriter in verbose
	// mode, so that per-blob entries are logged only when requested.
	var vPerc *tasklog.PercentageTask
	if opt.Verbose {
		vPerc = perc
	}

	var objectMapFile *os.File
	if len(opt.ObjectMapFilePath) > 0 {
		objectMapFile, err = os.OpenFile(opt.ObjectMapFilePath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666)
		if err != nil {
			return nil, errors.New(tr.Tr.Get("could not create object map file: %v", err))
		}
		defer objectMapFile.Close()
	}

	// The callbacks do not change between commits, so resolve them once
	// rather than on every iteration.
	blobFn, treePreFn, treeFn := opt.blobFn(), opt.treePreFn(), opt.treeFn()

	// Keep track of the last commit that we rewrote. Callers often want
	// this so that they can perform a git-update-ref(1).
	var tip []byte

	for _, oid := range commits {
		// Load the original commit to access the data necessary in
		// order to rewrite it.
		original, err := r.db.Commit(oid)
		if err != nil {
			return nil, err
		}

		// Rewrite the tree given at that commit.
		rewrittenTree, err := r.rewriteTree(oid, original.TreeID, "", blobFn, treePreFn, treeFn, vPerc)
		if err != nil {
			return nil, err
		}

		// Create a new list of parents from the original commit to
		// point at the rewritten parents in order to create a
		// topologically equivalent DAG.
		//
		// This operation is safe since we are visiting the commits in
		// reverse topological order and therefore have seen all parents
		// before children (in other words, r.uncacheCommit(...) will
		// always return a value, if the prospective parent is a part of
		// the migration).
		rewrittenParents := make([][]byte, 0, len(original.ParentIDs))
		for _, originalParent := range original.ParentIDs {
			rewrittenParent, ok := r.uncacheCommit(originalParent)
			if !ok {
				// If we haven't seen the parent before, this
				// means that we're doing a partial migration
				// and the parent that we're looking for isn't
				// included.
				//
				// Use the original parent to properly link
				// history across the migration boundary.
				rewrittenParent = originalParent
			}

			rewrittenParents = append(rewrittenParents, rewrittenParent)
		}

		// Construct a new commit using the original header information,
		// but the rewritten set of parents as well as root tree.
		rewrittenCommit := &gitobj.Commit{
			Author:       original.Author,
			Committer:    original.Committer,
			ExtraHeaders: original.ExtraHeaders,
			Message:      original.Message,
			ParentIDs:    rewrittenParents,
			TreeID:       rewrittenTree,
		}

		var newSha []byte

		if original.Equal(rewrittenCommit) {
			// Nothing changed: reuse (a copy of) the original OID
			// rather than writing an identical commit.
			newSha = make([]byte, len(oid))
			copy(newSha, oid)
		} else {
			newSha, err = r.db.WriteCommit(rewrittenCommit)
			if err != nil {
				return nil, err
			}
			if objectMapFile != nil {
				if _, err := fmt.Fprintf(objectMapFile, "%x,%x\n", oid, newSha); err != nil {
					return nil, err
				}
			}
		}

		// Cache that commit so that we can reassign children of this
		// commit.
		r.cacheCommit(oid, newSha)

		// Increment the percentage displayed in the terminal.
		perc.Count(1)

		// Move the tip forward.
		tip = newSha
	}

	if opt.UpdateRefs {
		refs, err := r.refsToMigrate()
		if err != nil {
			return nil, errors.Wrap(err, tr.Tr.Get("could not find refs to update"))
		}

		// The second result of Root() is deliberately ignored here; the
		// root is passed to the updater whether or not one is set.
		root, _ := r.db.Root()

		updater := &refUpdater{
			CacheFn: r.uncacheCommit,
			Logger:  r.l,
			Refs:    refs,
			Root:    root,

			db: r.db,
		}

		if err := updater.UpdateRefs(); err != nil {
			return nil, errors.Wrap(err, tr.Tr.Get("could not update refs"))
		}
	}

	// Every failure above returns early, so reaching this point means
	// success. Return an explicit nil instead of the outer "err", which is
	// shadowed throughout the loop.
	return tip, nil
}
// rewriteTree recursively rewrites the tree identified by "treeOID", located
// at "path" within the commit "commitOID".
//
// The TreePreCallbackFn "tpfn" is invoked first with the loaded tree. Each
// entry is then handled in order:
//
//   - entries rejected by the filter, and symbolic links, are copied verbatim;
//   - entries already present in the cache reuse the cached result, keeping the
//     current entry's file mode;
//   - blobs are passed through "fn", subtrees are rewritten recursively, and
//     any other entry type keeps its OID; the outcome is cached.
//
// Finally the TreeCallbackFn "tfn" is given the reassembled tree. If what it
// returns is equal to the original tree, the original OID is returned and
// nothing is written; otherwise the returned tree is written to the object
// database.
//
// It returns the OID of the resulting tree, or an error if the tree could not
// be rewritten.
func (r *Rewriter) rewriteTree(commitOID []byte, treeOID []byte, path string,
	fn BlobRewriteFn, tpfn TreePreCallbackFn, tfn TreeCallbackFn,
	perc *tasklog.PercentageTask) ([]byte, error) {

	tree, err := r.db.Tree(treeOID)
	if err != nil {
		return nil, err
	}

	// Tree callbacks see the tree's path with a leading slash.
	treePath := "/" + path
	if err := tpfn(treePath, tree); err != nil {
		return nil, err
	}

	rewrittenEntries := make([]*gitobj.TreeEntry, 0, len(tree.Entries))

	for _, entry := range tree.Entries {
		fullpath := entry.Name
		if path != "" {
			fullpath = path + "/" + entry.Name
		}

		// Entries excluded by the filter, as well as symlinks, are
		// carried over untouched.
		if !r.allows(entry.Type(), fullpath) || entry.Filemode == 0120000 {
			rewrittenEntries = append(rewrittenEntries, copyEntry(entry))
			continue
		}

		if cached := r.uncacheEntry(fullpath, entry); cached != nil {
			rewrittenEntries = append(rewrittenEntries, copyEntryMode(cached, entry.Filemode))
			continue
		}

		// Unless the entry is a blob or a subtree, its OID is kept.
		oid := entry.Oid
		switch entry.Type() {
		case gitobj.BlobObjectType:
			oid, err = r.rewriteBlob(commitOID, entry.Oid, fullpath, fn, perc)
		case gitobj.TreeObjectType:
			oid, err = r.rewriteTree(commitOID, entry.Oid, fullpath, fn, tpfn, tfn, perc)
		}
		if err != nil {
			return nil, err
		}

		replacement := &gitobj.TreeEntry{
			Filemode: entry.Filemode,
			Name:     entry.Name,
			Oid:      oid,
		}
		rewrittenEntries = append(rewrittenEntries, r.cacheEntry(fullpath, entry, replacement))
	}

	rewritten, err := tfn(treePath, &gitobj.Tree{Entries: rewrittenEntries})
	if err != nil {
		return nil, err
	}

	if !tree.Equal(rewritten) {
		return r.db.WriteTree(rewritten)
	}
	return treeOID, nil
}
// copyEntry returns a copy of the given *TreeEntry whose Oid is backed by its
// own storage, so the copy does not alias the original's Oid slice. It returns
// nil when given nil.
func copyEntry(e *gitobj.TreeEntry) *gitobj.TreeEntry {
	if e == nil {
		return nil
	}

	dup := &gitobj.TreeEntry{
		Filemode: e.Filemode,
		Name:     e.Name,
		Oid:      make([]byte, len(e.Oid)),
	}
	copy(dup.Oid, e.Oid)

	return dup
}
// copyEntryMode returns a copy of the given *TreeEntry (see copyEntry) with
// its Filemode replaced by "mode". Like copyEntry, it returns nil when given
// nil rather than dereferencing a nil pointer.
func copyEntryMode(e *gitobj.TreeEntry, mode int32) *gitobj.TreeEntry {
	copied := copyEntry(e)
	if copied == nil {
		return nil
	}

	copied.Filemode = mode
	return copied
}
// allows reports whether the entry of type "typ" at path "abs" should be
// processed by the rewriter. Blobs are checked against the rewriter's filter,
// with any leading "/" removed from the path; commit and tree entries are
// always allowed. Any other type causes a panic.
func (r *Rewriter) allows(typ gitobj.ObjectType, abs string) bool {
	if typ == gitobj.BlobObjectType {
		return r.Filter().Allows(strings.TrimPrefix(abs, "/"))
	}

	if typ == gitobj.CommitObjectType || typ == gitobj.TreeObjectType {
		return true
	}

	panic(fmt.Sprintf("git/githistory: %s", tr.Tr.Get("unknown entry type: %s", typ)))
}
// rewriteBlob calls the given BlobRewriteFn "fn" on a blob given in the object
// database by the OID "from". If the blob returned by "fn" is not equal to the
// source blob, it is written to the object database and its new OID is
// returned; otherwise "from" is returned unchanged.
//
// When "perc" is non-nil, an entry naming the commit and path is logged for
// each blob that was written.
//
// The source blob is closed on every path out of this function once it has
// been opened, including when "fn" or the write fails. An error is returned if
// the BlobRewriteFn returned one, or if the object could not be loaded, saved,
// or closed.
func (r *Rewriter) rewriteBlob(commitOID, from []byte, path string, fn BlobRewriteFn, perc *tasklog.PercentageTask) ([]byte, error) {
	blob, err := r.db.Blob(from)
	if err != nil {
		return nil, err
	}

	b, err := fn(path, blob)
	if err != nil {
		// Release the source blob before bailing out. The close error
		// is intentionally dropped: the callback's error is the one
		// worth reporting.
		_ = blob.Close()
		return nil, err
	}

	if blob.Equal(b) {
		// The rewritten blob is identical to the source, so nothing is
		// written; only the source blob needs to be closed.
		if err := blob.Close(); err != nil {
			return nil, err
		}
		return from, nil
	}

	sha, err := r.db.WriteBlob(b)
	if err != nil {
		// As above, do not leak the source blob when the write fails,
		// and prefer reporting the write error over any close error.
		_ = blob.Close()
		return nil, err
	}

	// The source blob differs from the one that was just written, so it
	// still has to be closed here. This must only happen on this branch:
	// were the two the same blob, closing it again after the write could
	// fail (closing an *os.File twice yields os.ErrInvalid).
	if err = blob.Close(); err != nil {
		return nil, err
	}

	if perc != nil {
		perc.Entry(fmt.Sprintf("migrate: %s", tr.Tr.Get("commit %s: %s", hex.EncodeToString(commitOID), path)))
	}

	return sha, nil
}
// commitsToMigrate collects, in memory, the OIDs of the commits selected by
// the Include and Exclude refs of the given *RewriteOptions, in the order
// produced by a *git.RevListScanner configured with r.scannerOpts().
//
// A "Sorting commits" waiter task is shown on the logger for the duration of
// the call.
//
// If any error was encountered while creating, reading from, or closing the
// scanner, it will be returned.
//
// NOTE(review): when scanner.Err() reports an error the scanner is returned
// from without being closed -- confirm whether that is intentional.
func (r *Rewriter) commitsToMigrate(opt *RewriteOptions) ([][]byte, error) {
	waiter := r.l.Waiter(fmt.Sprintf("migrate: %s", tr.Tr.Get("Sorting commits")))
	defer waiter.Complete()

	scanner, err := git.NewRevListScanner(opt.Include, opt.Exclude, r.scannerOpts())
	if err != nil {
		return nil, err
	}

	var oids [][]byte
	for scanner.Scan() {
		oids = append(oids, scanner.OID())
	}

	// Only close the scanner if scanning itself finished cleanly.
	err = scanner.Err()
	if err == nil {
		err = scanner.Close()
	}
	if err != nil {
		return nil, err
	}

	return oids, nil
}
// refsToMigrate lists the references eligible for migration: every reference
// known to the repository except remote branches. References are looked up
// within the database's root when it has one, and via git.AllRefs() otherwise.
// An error is returned if loading the references failed.
func (r *Rewriter) refsToMigrate() ([]*git.Ref, error) {
	var (
		all []*git.Ref
		err error
	)

	root, hasRoot := r.db.Root()
	if hasRoot {
		all, err = git.AllRefsIn(root)
	} else {
		all, err = git.AllRefs()
	}
	if err != nil {
		return nil, err
	}

	var local []*git.Ref
	for _, ref := range all {
		if ref.Type != git.RefTypeRemoteBranch {
			local = append(local, ref)
		}
	}

	return local, nil
}
// scannerOpts builds the *git.ScanRefsOptions handed to the
// *git.RevListScanner: commits only, in reversed topological order.
//
// When the object database reports a root, that root is used as the scanner's
// working directory.
func (r *Rewriter) scannerOpts() *git.ScanRefsOptions {
	root, hasRoot := r.db.Root()

	opts := &git.ScanRefsOptions{
		Mode:        git.ScanRefsMode,
		Order:       git.TopoRevListOrder,
		Reverse:     true,
		CommitsOnly: true,

		SkippedRefs: []string{},
		Mutex:       &sync.Mutex{},
		Names:       map[string]string{},
	}
	if hasRoot {
		opts.WorkingDir = root
	}

	return opts
}
// Filter returns the *filepathfilter.Filter configured on this *Rewriter, which
// is consulted to decide whether a blob may be rewritten. The result is nil
// unless a filter was supplied (for instance via WithFilter).
func (r *Rewriter) Filter() *filepathfilter.Filter {
return r.filter
}
// cacheEntry records that the entry "from", found at "path", is rewritten as
// the entry "to", so that later lookups with the same path and OID resolve to
// "to". It returns "to" for convenient chaining.
func (r *Rewriter) cacheEntry(path string, from, to *gitobj.TreeEntry) *gitobj.TreeEntry {
	key := r.entryKey(path, from)

	r.mu.Lock()
	defer r.mu.Unlock()

	r.entries[key] = to
	return to
}
// uncacheEntry looks up the rewritten counterpart of the entry "from" found at
// "path". It returns the *TreeEntry that "from" should be rewritten to, or nil
// if no such entry has been cached.
func (r *Rewriter) uncacheEntry(path string, from *gitobj.TreeEntry) *gitobj.TreeEntry {
	key := r.entryKey(path, from)

	r.mu.Lock()
	defer r.mu.Unlock()

	return r.entries[key]
}
// entryKey builds the cache key for the *TreeEntry "e" located at "path": the
// path, a colon, and the lowercase hex encoding of the entry's OID.
func (r *Rewriter) entryKey(path string, e *gitobj.TreeEntry) string {
	return path + ":" + hex.EncodeToString(e.Oid)
}
// cacheCommit records that the commit with OID "from" has been rewritten as
// the commit with OID "to". The hex encoding of "from" is used as the key.
func (r *Rewriter) cacheCommit(from, to []byte) {
	key := hex.EncodeToString(from)

	r.mu.Lock()
	defer r.mu.Unlock()

	r.commits[key] = to
}
// uncacheCommit looks up the OID that the commit with OID "from" was rewritten
// to. It returns that OID and true, or nil and false if no rewrite of "from"
// has been cached.
func (r *Rewriter) uncacheCommit(from []byte) ([]byte, bool) {
	key := hex.EncodeToString(from)

	r.mu.Lock()
	defer r.mu.Unlock()

	rewritten, found := r.commits[key]
	return rewritten, found
}
|