From be9b214c0cfcc0560f9a7159ded870cfa94bb138 Mon Sep 17 00:00:00 2001 From: ashmrtn Date: Fri, 9 Dec 2022 15:55:45 -0800 Subject: [PATCH] Track more info about collections during backup (#1765) ## Description Track additional information about collections and their items during backup so we can properly merge directories and items in directories when doing incremental backups ## Type of change - [ ] :sunflower: Feature - [ ] :bug: Bugfix - [ ] :world_map: Documentation - [ ] :robot: Test - [ ] :computer: CI/Deployment - [x] :hamster: Trivial/Minor ## Issue(s) * #1740 ## Test Plan - [ ] :muscle: Manual - [x] :zap: Unit test - [ ] :green_heart: E2E --- src/internal/kopia/upload.go | 57 ++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/src/internal/kopia/upload.go b/src/internal/kopia/upload.go index 4b0659927..2215ed9c1 100644 --- a/src/internal/kopia/upload.go +++ b/src/internal/kopia/upload.go @@ -195,15 +195,16 @@ func collectionEntries( cb func(context.Context, fs.Entry) error, streamedEnts data.Collection, progress *corsoProgress, -) *multierror.Error { +) (map[string]struct{}, *multierror.Error) { if streamedEnts == nil { - return nil + return nil, nil } var ( errs *multierror.Error // Track which items have already been seen so we can skip them if we see // them again in the data from the base snapshot. + seen = map[string]struct{}{} items = streamedEnts.Items() log = logger.Ctx(ctx) ) @@ -212,13 +213,28 @@ func collectionEntries( select { case <-ctx.Done(): errs = multierror.Append(errs, ctx.Err()) - return errs + return seen, errs case e, ok := <-items: if !ok { - return errs + return seen, errs } + // Even if this item has been deleted and should not appear at all in + // the new snapshot we need to record that we've seen it here so we know + // to skip it if it's also present in the base snapshot. + // + // TODO(ashmrtn): Determine if we want to try to use the old version of + // the data (if it exists in the base) if we fail uploading the new + // version. If so, we should split this call into where we check for the + // item being deleted and then again after we do the kopia callback. + // + // TODO(ashmrtn): With a little more info, we could reduce the number of + // items we need to track. Namely, we can track the created time of the + // item and if it's after the base snapshot was finalized we can skip it + // because it's not possible for the base snapshot to contain that item. + seen[e.UUID()] = struct{}{} + // For now assuming that item IDs don't need escaping. itemPath, err := streamedEnts.FullPath().Append(e.UUID(), true) if err != nil { @@ -231,7 +247,11 @@ func collectionEntries( } log.Debugw("reading item", "path", itemPath.String()) - trace.Log(ctx, "kopia:collectionEntries:item", itemPath.String()) + trace.Log(ctx, "kopia:streamEntries:item", itemPath.String()) + + if e.Deleted() { + continue + } // Not all items implement StreamInfo. For example, the metadata files // do not because they don't contain information directly backed up or @@ -264,7 +284,7 @@ func collectionEntries( // Kopia's uploader swallows errors in most cases, so if we see // something here it's probably a big issue and we should return. errs = multierror.Append(errs, errors.Wrapf(err, "executing callback on %q", itemPath)) - return errs + return seen, errs } } } @@ -290,7 +310,7 @@ func getStreamItemFunc( } } - errs := collectionEntries(ctx, cb, streamedEnts, progress) + _, errs := collectionEntries(ctx, cb, streamedEnts, progress) // TODO(ashmrtn): Stream entries from base snapshot if they exist. @@ -370,21 +390,34 @@ func getTreeNode(roots map[string]*treeMap, pathElements []string) *treeMap { func inflateCollectionTree( ctx context.Context, collections []data.Collection, -) (map[string]*treeMap, *OwnersCats, error) { +) (map[string]*treeMap, map[string]path.Path, *OwnersCats, error) { roots := make(map[string]*treeMap) + // Contains the old path for collections that have been moved or renamed. + // Allows resolving what the new path should be when walking the base + // snapshot(s)'s hierarchy. Nil represents a collection that was deleted. + updatedPaths := make(map[string]path.Path) ownerCats := &OwnersCats{ ResourceOwners: make(map[string]struct{}), ServiceCats: make(map[string]struct{}), } for _, s := range collections { + switch s.State() { + case data.DeletedState: + updatedPaths[s.PreviousPath().String()] = nil + continue + + case data.MovedState: + updatedPaths[s.PreviousPath().String()] = s.FullPath() + } + if s.FullPath() == nil || len(s.FullPath().Elements()) == 0 { - return nil, nil, errors.New("no identifier for collection") + return nil, nil, nil, errors.New("no identifier for collection") } node := getTreeNode(roots, s.FullPath().Elements()) if node == nil { - return nil, nil, errors.Errorf( + return nil, nil, nil, errors.Errorf( "unable to get tree node for path %s", s.FullPath(), ) @@ -397,7 +430,7 @@ func inflateCollectionTree( node.collection = s } - return roots, ownerCats, nil + return roots, updatedPaths, ownerCats, nil } // inflateDirTree returns a set of tags representing all the resource owners and @@ -411,7 +444,7 @@ func inflateDirTree( collections []data.Collection, progress *corsoProgress, ) (fs.Directory, *OwnersCats, error) { - roots, ownerCats, err := inflateCollectionTree(ctx, collections) + roots, _, ownerCats, err := inflateCollectionTree(ctx, collections) if err != nil { return nil, nil, errors.Wrap(err, "inflating collection tree") }