Add a set of items that will be excluded from base directories during backup (#2143)

## Description

Some external services like OneDrive do not have the ability to determine the path a deleted item used to exist at; they only know the item's old ID. This patch allows Kopia Wrapper to handle those items by implementing a global exclude set. The patch assumes that items in base directories are unique, as the items in every base directory are checked against the set.

This is not wired to anything outside of Kopia Wrapper. Currently this feature is disabled, as the passed value is always nil.

## Does this PR need a docs update or release note?

- [ ]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [x]  No 

## Type of change

- [x] 🌻 Feature
- [ ] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Test
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

## Issue(s)

* closes #2121 

## Test Plan

- [ ] 💪 Manual
- [x]  Unit test
- [ ] 💚 E2E
This commit is contained in:
ashmrtn 2023-01-25 08:28:09 -08:00 committed by GitHub
parent 91a9077d90
commit 46d61c7246
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 94 additions and 17 deletions

View File

@@ -345,6 +345,7 @@ func streamBaseEntries(
prevPath path.Path,
dir fs.Directory,
encodedSeen map[string]struct{},
globalExcludeSet map[string]struct{},
progress *corsoProgress,
) error {
if dir == nil {
@@ -373,6 +374,12 @@ func streamBaseEntries(
return errors.Wrapf(err, "unable to decode entry name %s", entry.Name())
}
// This entry was marked as deleted by a service that can't tell us the
// previous path of deleted items, only the item ID.
if _, ok := globalExcludeSet[entName]; ok {
return nil
}
// For now assuming that item IDs don't need escaping.
itemPath, err := curPath.Append(entName, true)
if err != nil {
@@ -421,6 +428,7 @@ func getStreamItemFunc(
staticEnts []fs.Entry,
streamedEnts data.Collection,
baseDir fs.Directory,
globalExcludeSet map[string]struct{},
progress *corsoProgress,
) func(context.Context, func(context.Context, fs.Entry) error) error {
return func(ctx context.Context, cb func(context.Context, fs.Entry) error) error {
@@ -443,6 +451,7 @@ func getStreamItemFunc(
prevPath,
baseDir,
seen,
globalExcludeSet,
progress,
); err != nil {
errs = multierror.Append(
@@ -457,21 +466,22 @@ func getStreamItemFunc(
// buildKopiaDirs recursively builds a directory hierarchy from the roots up.
// Returned directories are virtualfs.StreamingDirectory.
func buildKopiaDirs(dirName string, dir *treeMap, progress *corsoProgress) (fs.Directory, error) {
func buildKopiaDirs(
dirName string,
dir *treeMap,
globalExcludeSet map[string]struct{},
progress *corsoProgress,
) (fs.Directory, error) {
// Reuse kopia directories directly if the subtree rooted at them is
// unchanged.
//
// TODO(ashmrtn): This will need updated when we have OneDrive backups where
// items have been deleted because we can't determine which directory used to
// have the item.
//
// TODO(ashmrtn): We could possibly also use this optimization if we know that
// the collection has no items in it. In that case though, we may need to take
// extra care to ensure the name of the directory is properly represented. For
// example, a directory that has been renamed but with no additional items may
// not be able to directly use kopia's version of the directory due to the
// rename.
if dir.collection == nil && len(dir.childDirs) == 0 && dir.baseDir != nil {
if dir.collection == nil && len(dir.childDirs) == 0 && dir.baseDir != nil && len(globalExcludeSet) == 0 {
return dir.baseDir, nil
}
@@ -480,7 +490,7 @@ func buildKopiaDirs(dirName string, dir *treeMap, progress *corsoProgress) (fs.D
var childDirs []fs.Entry
for childName, childDir := range dir.childDirs {
child, err := buildKopiaDirs(childName, childDir, progress)
child, err := buildKopiaDirs(childName, childDir, globalExcludeSet, progress)
if err != nil {
return nil, err
}
@@ -496,6 +506,7 @@ func buildKopiaDirs(dirName string, dir *treeMap, progress *corsoProgress) (fs.D
childDirs,
dir.collection,
dir.baseDir,
globalExcludeSet,
progress,
),
), nil
@@ -879,11 +890,19 @@ func inflateBaseTree(
// virtualfs.StreamingDirectory with the given DataCollections if there is one
// for that node. Tags can be used in future backups to fetch old snapshots for
// caching reasons.
//
// globalExcludeSet represents a set of items, represented with file names, to
// exclude from base directories when uploading the snapshot. As items in *all*
// base directories will be checked for in every base directory, this assumes
// that items in the bases are unique. Deletions of directories or subtrees
// should be represented as changes in the status of a Collection, not an entry
// in the globalExcludeSet.
func inflateDirTree(
ctx context.Context,
loader snapshotLoader,
baseSnaps []IncrementalBase,
collections []data.Collection,
globalExcludeSet map[string]struct{},
progress *corsoProgress,
) (fs.Directory, error) {
roots, updatedPaths, err := inflateCollectionTree(ctx, collections)
@@ -915,7 +934,7 @@ func inflateDirTree(
var res fs.Directory
for dirName, dir := range roots {
tmp, err := buildKopiaDirs(dirName, dir, progress)
tmp, err := buildKopiaDirs(dirName, dir, globalExcludeSet, progress)
if err != nil {
return nil, err
}

View File

@@ -705,7 +705,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTree() {
// - emails
// - Inbox
// - 42 separate files
dirTree, err := inflateDirTree(ctx, nil, nil, collections, progress)
dirTree, err := inflateDirTree(ctx, nil, nil, collections, nil, progress)
require.NoError(t, err)
assert.Equal(t, encodeAsPath(testTenant), dirTree.Name())
@@ -793,7 +793,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTree_MixedDirectory()
suite.T().Run(test.name, func(t *testing.T) {
progress := &corsoProgress{pending: map[string]*itemDetails{}}
dirTree, err := inflateDirTree(ctx, nil, nil, test.layout, progress)
dirTree, err := inflateDirTree(ctx, nil, nil, test.layout, nil, progress)
require.NoError(t, err)
assert.Equal(t, encodeAsPath(testTenant), dirTree.Name())
@@ -889,7 +889,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTree_Fails() {
defer flush()
suite.T().Run(test.name, func(t *testing.T) {
_, err := inflateDirTree(ctx, nil, nil, test.layout, nil)
_, err := inflateDirTree(ctx, nil, nil, test.layout, nil, nil)
assert.Error(t, err)
})
}
@@ -992,7 +992,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeErrors() {
cols = append(cols, mc)
}
_, err := inflateDirTree(ctx, nil, nil, cols, progress)
_, err := inflateDirTree(ctx, nil, nil, cols, nil, progress)
require.Error(t, err)
})
}
@@ -1261,6 +1261,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeSingleSubtree() {
mockIncrementalBase("", testTenant, testUser, path.ExchangeService, path.EmailCategory),
},
test.inputCollections(),
nil,
progress,
)
require.NoError(t, err)
@@ -1281,7 +1282,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeMultipleSubdirecto
[]string{testTenant, service, testUser, category, testInboxDir},
false,
)
inboxFileName1 := testFileName4
inboxFileName1 := testFileName
inboxFileData1 := testFileData4
inboxFileName2 := testFileName5
inboxFileData2 := testFileData5
@@ -1291,7 +1292,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeMultipleSubdirecto
append(inboxPath.Elements(), personalDir),
false,
)
personalFileName1 := testFileName
personalFileName1 := inboxFileName1
personalFileName2 := testFileName2
workPath := makePath(
@@ -1312,7 +1313,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeMultipleSubdirecto
// - user1
// - email
// - Inbox
// - file4
// - file1
// - personal
// - file1
// - file2
@@ -1369,8 +1370,51 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeMultipleSubdirecto
table := []struct {
name string
inputCollections func(t *testing.T) []data.Collection
inputExcludes map[string]struct{}
expected *expectedNode
}{
{
name: "GlobalExcludeSet",
inputCollections: func(t *testing.T) []data.Collection {
return nil
},
inputExcludes: map[string]struct{}{
inboxFileName1: {},
},
expected: expectedTreeWithChildren(
[]string{
testTenant,
service,
testUser,
category,
},
[]*expectedNode{
{
name: testInboxDir,
children: []*expectedNode{
{
name: personalDir,
children: []*expectedNode{
{
name: personalFileName2,
children: []*expectedNode{},
},
},
},
{
name: workDir,
children: []*expectedNode{
{
name: workFileName1,
children: []*expectedNode{},
},
},
},
},
},
},
),
},
{
name: "MovesSubtree",
inputCollections: func(t *testing.T) []data.Collection {
@@ -1919,6 +1963,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeMultipleSubdirecto
mockIncrementalBase("", testTenant, testUser, path.ExchangeService, path.EmailCategory),
},
test.inputCollections(t),
test.inputExcludes,
progress,
)
require.NoError(t, err)
@@ -2079,6 +2124,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeSkipsDeletedSubtre
mockIncrementalBase("", testTenant, testUser, path.ExchangeService, path.EmailCategory),
},
collections,
nil,
progress,
)
require.NoError(t, err)
@@ -2325,6 +2371,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeSelectsCorrectSubt
mockIncrementalBase("id2", testTenant, testUser, path.ExchangeService, path.EmailCategory),
},
collections,
nil,
progress,
)
require.NoError(t, err)

View File

@@ -129,7 +129,11 @@ func (w Wrapper) BackupCollections(
ctx, end := D.Span(ctx, "kopia:backupCollections")
defer end()
if len(collections) == 0 {
// TODO(ashmrtn): Make this a parameter when actually enabling the global
// exclude set.
var globalExcludeSet map[string]struct{}
if len(collections) == 0 && len(globalExcludeSet) == 0 {
return &BackupStats{}, &details.Builder{}, nil, nil
}
@@ -147,7 +151,14 @@ func (w Wrapper) BackupCollections(
base = previousSnapshots
}
dirTree, err := inflateDirTree(ctx, w.c, base, collections, progress)
dirTree, err := inflateDirTree(
ctx,
w.c,
base,
collections,
globalExcludeSet,
progress,
)
if err != nil {
return nil, nil, nil, errors.Wrap(err, "building kopia directories")
}