Add a set of items that will be excluded from base directories during backup (#2143)

## Description

Some external services like OneDrive do not have the ability to determine the path a deleted item used to exist at; they only know the item's old ID. This patch allows Kopia Wrapper to handle those items by implementing a global exclude set. The patch assumes that items in base directories are unique, as the items in every base directory are checked against the set.

This is not wired to anything outside Kopia Wrapper. The feature is currently disabled because the passed value is always nil.

## Does this PR need a docs update or release note?

- [ ]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [x] No

## Type of change

- [x] 🌻 Feature
- [ ] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Test
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

## Issue(s)

* closes #2121 

## Test Plan

- [ ] 💪 Manual
- [x] Unit test
- [ ] 💚 E2E
This commit is contained in:
ashmrtn 2023-01-25 08:28:09 -08:00 committed by GitHub
parent 91a9077d90
commit 46d61c7246
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 94 additions and 17 deletions

View File

@ -345,6 +345,7 @@ func streamBaseEntries(
prevPath path.Path, prevPath path.Path,
dir fs.Directory, dir fs.Directory,
encodedSeen map[string]struct{}, encodedSeen map[string]struct{},
globalExcludeSet map[string]struct{},
progress *corsoProgress, progress *corsoProgress,
) error { ) error {
if dir == nil { if dir == nil {
@ -373,6 +374,12 @@ func streamBaseEntries(
return errors.Wrapf(err, "unable to decode entry name %s", entry.Name()) return errors.Wrapf(err, "unable to decode entry name %s", entry.Name())
} }
// This entry was marked as deleted by a service that can't tell us the
// previous path of deleted items, only the item ID.
if _, ok := globalExcludeSet[entName]; ok {
return nil
}
// For now assuming that item IDs don't need escaping. // For now assuming that item IDs don't need escaping.
itemPath, err := curPath.Append(entName, true) itemPath, err := curPath.Append(entName, true)
if err != nil { if err != nil {
@ -421,6 +428,7 @@ func getStreamItemFunc(
staticEnts []fs.Entry, staticEnts []fs.Entry,
streamedEnts data.Collection, streamedEnts data.Collection,
baseDir fs.Directory, baseDir fs.Directory,
globalExcludeSet map[string]struct{},
progress *corsoProgress, progress *corsoProgress,
) func(context.Context, func(context.Context, fs.Entry) error) error { ) func(context.Context, func(context.Context, fs.Entry) error) error {
return func(ctx context.Context, cb func(context.Context, fs.Entry) error) error { return func(ctx context.Context, cb func(context.Context, fs.Entry) error) error {
@ -443,6 +451,7 @@ func getStreamItemFunc(
prevPath, prevPath,
baseDir, baseDir,
seen, seen,
globalExcludeSet,
progress, progress,
); err != nil { ); err != nil {
errs = multierror.Append( errs = multierror.Append(
@ -457,21 +466,22 @@ func getStreamItemFunc(
// buildKopiaDirs recursively builds a directory hierarchy from the roots up. // buildKopiaDirs recursively builds a directory hierarchy from the roots up.
// Returned directories are virtualfs.StreamingDirectory. // Returned directories are virtualfs.StreamingDirectory.
func buildKopiaDirs(dirName string, dir *treeMap, progress *corsoProgress) (fs.Directory, error) { func buildKopiaDirs(
dirName string,
dir *treeMap,
globalExcludeSet map[string]struct{},
progress *corsoProgress,
) (fs.Directory, error) {
// Reuse kopia directories directly if the subtree rooted at them is // Reuse kopia directories directly if the subtree rooted at them is
// unchanged. // unchanged.
// //
// TODO(ashmrtn): This will need updated when we have OneDrive backups where
// items have been deleted because we can't determine which directory used to
// have the item.
//
// TODO(ashmrtn): We could possibly also use this optimization if we know that // TODO(ashmrtn): We could possibly also use this optimization if we know that
// the collection has no items in it. In that case though, we may need to take // the collection has no items in it. In that case though, we may need to take
// extra care to ensure the name of the directory is properly represented. For // extra care to ensure the name of the directory is properly represented. For
// example, a directory that has been renamed but with no additional items may // example, a directory that has been renamed but with no additional items may
// not be able to directly use kopia's version of the directory due to the // not be able to directly use kopia's version of the directory due to the
// rename. // rename.
if dir.collection == nil && len(dir.childDirs) == 0 && dir.baseDir != nil { if dir.collection == nil && len(dir.childDirs) == 0 && dir.baseDir != nil && len(globalExcludeSet) == 0 {
return dir.baseDir, nil return dir.baseDir, nil
} }
@ -480,7 +490,7 @@ func buildKopiaDirs(dirName string, dir *treeMap, progress *corsoProgress) (fs.D
var childDirs []fs.Entry var childDirs []fs.Entry
for childName, childDir := range dir.childDirs { for childName, childDir := range dir.childDirs {
child, err := buildKopiaDirs(childName, childDir, progress) child, err := buildKopiaDirs(childName, childDir, globalExcludeSet, progress)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -496,6 +506,7 @@ func buildKopiaDirs(dirName string, dir *treeMap, progress *corsoProgress) (fs.D
childDirs, childDirs,
dir.collection, dir.collection,
dir.baseDir, dir.baseDir,
globalExcludeSet,
progress, progress,
), ),
), nil ), nil
@ -879,11 +890,19 @@ func inflateBaseTree(
// virtualfs.StreamingDirectory with the given DataCollections if there is one // virtualfs.StreamingDirectory with the given DataCollections if there is one
// for that node. Tags can be used in future backups to fetch old snapshots for // for that node. Tags can be used in future backups to fetch old snapshots for
// caching reasons. // caching reasons.
//
// globalExcludeSet represents a set of items, represented with file names, to
// exclude from base directories when uploading the snapshot. As items in *all*
// base directories will be checked for in every base directory, this assumes
// that items in the bases are unique. Deletions of directories or subtrees
// should be represented as changes in the status of a Collection, not an entry
// in the globalExcludeSet.
func inflateDirTree( func inflateDirTree(
ctx context.Context, ctx context.Context,
loader snapshotLoader, loader snapshotLoader,
baseSnaps []IncrementalBase, baseSnaps []IncrementalBase,
collections []data.Collection, collections []data.Collection,
globalExcludeSet map[string]struct{},
progress *corsoProgress, progress *corsoProgress,
) (fs.Directory, error) { ) (fs.Directory, error) {
roots, updatedPaths, err := inflateCollectionTree(ctx, collections) roots, updatedPaths, err := inflateCollectionTree(ctx, collections)
@ -915,7 +934,7 @@ func inflateDirTree(
var res fs.Directory var res fs.Directory
for dirName, dir := range roots { for dirName, dir := range roots {
tmp, err := buildKopiaDirs(dirName, dir, progress) tmp, err := buildKopiaDirs(dirName, dir, globalExcludeSet, progress)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -705,7 +705,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTree() {
// - emails // - emails
// - Inbox // - Inbox
// - 42 separate files // - 42 separate files
dirTree, err := inflateDirTree(ctx, nil, nil, collections, progress) dirTree, err := inflateDirTree(ctx, nil, nil, collections, nil, progress)
require.NoError(t, err) require.NoError(t, err)
assert.Equal(t, encodeAsPath(testTenant), dirTree.Name()) assert.Equal(t, encodeAsPath(testTenant), dirTree.Name())
@ -793,7 +793,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTree_MixedDirectory()
suite.T().Run(test.name, func(t *testing.T) { suite.T().Run(test.name, func(t *testing.T) {
progress := &corsoProgress{pending: map[string]*itemDetails{}} progress := &corsoProgress{pending: map[string]*itemDetails{}}
dirTree, err := inflateDirTree(ctx, nil, nil, test.layout, progress) dirTree, err := inflateDirTree(ctx, nil, nil, test.layout, nil, progress)
require.NoError(t, err) require.NoError(t, err)
assert.Equal(t, encodeAsPath(testTenant), dirTree.Name()) assert.Equal(t, encodeAsPath(testTenant), dirTree.Name())
@ -889,7 +889,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTree_Fails() {
defer flush() defer flush()
suite.T().Run(test.name, func(t *testing.T) { suite.T().Run(test.name, func(t *testing.T) {
_, err := inflateDirTree(ctx, nil, nil, test.layout, nil) _, err := inflateDirTree(ctx, nil, nil, test.layout, nil, nil)
assert.Error(t, err) assert.Error(t, err)
}) })
} }
@ -992,7 +992,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeErrors() {
cols = append(cols, mc) cols = append(cols, mc)
} }
_, err := inflateDirTree(ctx, nil, nil, cols, progress) _, err := inflateDirTree(ctx, nil, nil, cols, nil, progress)
require.Error(t, err) require.Error(t, err)
}) })
} }
@ -1261,6 +1261,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeSingleSubtree() {
mockIncrementalBase("", testTenant, testUser, path.ExchangeService, path.EmailCategory), mockIncrementalBase("", testTenant, testUser, path.ExchangeService, path.EmailCategory),
}, },
test.inputCollections(), test.inputCollections(),
nil,
progress, progress,
) )
require.NoError(t, err) require.NoError(t, err)
@ -1281,7 +1282,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeMultipleSubdirecto
[]string{testTenant, service, testUser, category, testInboxDir}, []string{testTenant, service, testUser, category, testInboxDir},
false, false,
) )
inboxFileName1 := testFileName4 inboxFileName1 := testFileName
inboxFileData1 := testFileData4 inboxFileData1 := testFileData4
inboxFileName2 := testFileName5 inboxFileName2 := testFileName5
inboxFileData2 := testFileData5 inboxFileData2 := testFileData5
@ -1291,7 +1292,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeMultipleSubdirecto
append(inboxPath.Elements(), personalDir), append(inboxPath.Elements(), personalDir),
false, false,
) )
personalFileName1 := testFileName personalFileName1 := inboxFileName1
personalFileName2 := testFileName2 personalFileName2 := testFileName2
workPath := makePath( workPath := makePath(
@ -1312,7 +1313,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeMultipleSubdirecto
// - user1 // - user1
// - email // - email
// - Inbox // - Inbox
// - file4 // - file1
// - personal // - personal
// - file1 // - file1
// - file2 // - file2
@ -1369,8 +1370,51 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeMultipleSubdirecto
table := []struct { table := []struct {
name string name string
inputCollections func(t *testing.T) []data.Collection inputCollections func(t *testing.T) []data.Collection
inputExcludes map[string]struct{}
expected *expectedNode expected *expectedNode
}{ }{
{
name: "GlobalExcludeSet",
inputCollections: func(t *testing.T) []data.Collection {
return nil
},
inputExcludes: map[string]struct{}{
inboxFileName1: {},
},
expected: expectedTreeWithChildren(
[]string{
testTenant,
service,
testUser,
category,
},
[]*expectedNode{
{
name: testInboxDir,
children: []*expectedNode{
{
name: personalDir,
children: []*expectedNode{
{
name: personalFileName2,
children: []*expectedNode{},
},
},
},
{
name: workDir,
children: []*expectedNode{
{
name: workFileName1,
children: []*expectedNode{},
},
},
},
},
},
},
),
},
{ {
name: "MovesSubtree", name: "MovesSubtree",
inputCollections: func(t *testing.T) []data.Collection { inputCollections: func(t *testing.T) []data.Collection {
@ -1919,6 +1963,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeMultipleSubdirecto
mockIncrementalBase("", testTenant, testUser, path.ExchangeService, path.EmailCategory), mockIncrementalBase("", testTenant, testUser, path.ExchangeService, path.EmailCategory),
}, },
test.inputCollections(t), test.inputCollections(t),
test.inputExcludes,
progress, progress,
) )
require.NoError(t, err) require.NoError(t, err)
@ -2079,6 +2124,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeSkipsDeletedSubtre
mockIncrementalBase("", testTenant, testUser, path.ExchangeService, path.EmailCategory), mockIncrementalBase("", testTenant, testUser, path.ExchangeService, path.EmailCategory),
}, },
collections, collections,
nil,
progress, progress,
) )
require.NoError(t, err) require.NoError(t, err)
@ -2325,6 +2371,7 @@ func (suite *HierarchyBuilderUnitSuite) TestBuildDirectoryTreeSelectsCorrectSubt
mockIncrementalBase("id2", testTenant, testUser, path.ExchangeService, path.EmailCategory), mockIncrementalBase("id2", testTenant, testUser, path.ExchangeService, path.EmailCategory),
}, },
collections, collections,
nil,
progress, progress,
) )
require.NoError(t, err) require.NoError(t, err)

View File

@ -129,7 +129,11 @@ func (w Wrapper) BackupCollections(
ctx, end := D.Span(ctx, "kopia:backupCollections") ctx, end := D.Span(ctx, "kopia:backupCollections")
defer end() defer end()
if len(collections) == 0 { // TODO(ashmrtn): Make this a parameter when actually enabling the global
// exclude set.
var globalExcludeSet map[string]struct{}
if len(collections) == 0 && len(globalExcludeSet) == 0 {
return &BackupStats{}, &details.Builder{}, nil, nil return &BackupStats{}, &details.Builder{}, nil, nil
} }
@ -147,7 +151,14 @@ func (w Wrapper) BackupCollections(
base = previousSnapshots base = previousSnapshots
} }
dirTree, err := inflateDirTree(ctx, w.c, base, collections, progress) dirTree, err := inflateDirTree(
ctx,
w.c,
base,
collections,
globalExcludeSet,
progress,
)
if err != nil { if err != nil {
return nil, nil, nil, errors.Wrap(err, "building kopia directories") return nil, nil, nil, errors.Wrap(err, "building kopia directories")
} }