Exclude metadata files from the set that we merge during incrementals (#2650)

Metadata files don't need to appear in backup
details, which means they also shouldn't be
merged during incremental backups. If we
expect them to be merged, the item count check
for the merge process will fail.

This change excludes items with metadata
extensions from merging by not adding them to
the list of items to merge.

It also ensures that metadata items no longer
create backup details entries in the first
place.

Manually tested to confirm that
restore(backup(onedrive)) still works as
expected.
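
Purely as an illustration (this code is not part of the PR), a minimal Go sketch of the idea: metadata paths are dropped before they ever enter the merge list, using the new `metadata.IsMetadataFile` helper. `filterMergeable` and `candidates` are hypothetical names used only for this sketch.

```go
package example

import (
	"github.com/alcionai/corso/src/internal/connector/graph/metadata"
	"github.com/alcionai/corso/src/pkg/path"
)

// filterMergeable is a hypothetical helper: it drops metadata paths so they
// never enter the set of items to merge, since metadata files have no backup
// details entries and would throw off the merged-item count check.
func filterMergeable(candidates []path.Path) []path.Path {
	toMerge := make([]path.Path, 0, len(candidates))

	for _, p := range candidates {
		if metadata.IsMetadataFile(p) {
			continue
		}

		toMerge = append(toMerge, p)
	}

	return toMerge
}
```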
---

#### Does this PR need a docs update or release note?

- [ ] Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [x] No

#### Type of change

- [ ] 🌻 Feature
- [x] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Test
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

* closes #2498

#### Test Plan

- [ ] 💪 Manual
- [x] Unit test
- [ ] 💚 E2E
ashmrtn 2023-03-02 05:49:19 -08:00 committed by GitHub
parent e42e3f253b
commit 83fe83a367
6 changed files with 392 additions and 41 deletions

View File

@@ -0,0 +1,16 @@
package metadata
import (
"github.com/alcionai/corso/src/internal/connector/onedrive"
"github.com/alcionai/corso/src/pkg/path"
)
func IsMetadataFile(p path.Path) bool {
switch p.Service() {
case path.OneDriveService:
return onedrive.IsMetaFile(p.Item())
default:
return false
}
}
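
A short usage sketch (assumed; it mirrors the unit tests below): only a OneDrive file path whose item name carries a metadata suffix reports true, while paths from other services always report false.

```go
package main

import (
	"fmt"

	"github.com/alcionai/corso/src/internal/connector/graph/metadata"
	"github.com/alcionai/corso/src/internal/connector/onedrive"
	"github.com/alcionai/corso/src/pkg/path"
)

func main() {
	// Build a OneDrive file path whose item name ends in the .meta suffix.
	p, err := path.Builder{}.
		Append("file" + onedrive.MetaFileSuffix).
		ToDataLayerPath("a-tenant", "a-user", path.OneDriveService, path.FilesCategory, true)
	if err != nil {
		panic(err)
	}

	fmt.Println(metadata.IsMetadataFile(p)) // true: OneDrive item with a metadata suffix
}
```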

View File

@@ -0,0 +1,155 @@
package metadata_test
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"
"github.com/alcionai/corso/src/internal/connector/graph/metadata"
"github.com/alcionai/corso/src/internal/connector/onedrive"
"github.com/alcionai/corso/src/internal/tester"
"github.com/alcionai/corso/src/pkg/path"
)
type boolfAssertionFunc func(assert.TestingT, bool, string, ...interface{}) bool
type testCase struct {
service path.ServiceType
category path.CategoryType
expected boolfAssertionFunc
}
var (
tenant = "a-tenant"
user = "a-user"
notMetaSuffixes = []string{
"",
onedrive.DataFileSuffix,
}
metaSuffixes = []string{
onedrive.MetaFileSuffix,
onedrive.DirMetaFileSuffix,
}
cases = []testCase{
{
service: path.ExchangeService,
category: path.EmailCategory,
expected: assert.Falsef,
},
{
service: path.ExchangeService,
category: path.ContactsCategory,
expected: assert.Falsef,
},
{
service: path.ExchangeService,
category: path.EventsCategory,
expected: assert.Falsef,
},
{
service: path.OneDriveService,
category: path.FilesCategory,
expected: assert.Truef,
},
{
service: path.SharePointService,
category: path.LibrariesCategory,
expected: assert.Falsef,
},
{
service: path.SharePointService,
category: path.ListsCategory,
expected: assert.Falsef,
},
{
service: path.SharePointService,
category: path.PagesCategory,
expected: assert.Falsef,
},
}
)
type MetadataUnitSuite struct {
tester.Suite
}
func TestMetadataUnitSuite(t *testing.T) {
suite.Run(t, &MetadataUnitSuite{Suite: tester.NewUnitSuite(t)})
}
func (suite *MetadataUnitSuite) TestIsMetadataFile_Files_MetaSuffixes() {
for _, test := range cases {
for _, ext := range metaSuffixes {
suite.Run(fmt.Sprintf("%s %s %s", test.service, test.category, ext), func() {
t := suite.T()
p, err := path.Builder{}.
Append("file"+ext).
ToDataLayerPath(
tenant,
user,
test.service,
test.category,
true,
)
require.NoError(t, err)
test.expected(t, metadata.IsMetadataFile(p), "extension %s", ext)
})
}
}
}
func (suite *MetadataUnitSuite) TestIsMetadataFile_Files_NotMetaSuffixes() {
for _, test := range cases {
for _, ext := range notMetaSuffixes {
suite.Run(fmt.Sprintf("%s %s %s", test.service, test.category, ext), func() {
t := suite.T()
p, err := path.Builder{}.
Append("file"+ext).
ToDataLayerPath(
tenant,
user,
test.service,
test.category,
true,
)
require.NoError(t, err)
assert.Falsef(t, metadata.IsMetadataFile(p), "extension %s", ext)
})
}
}
}
func (suite *MetadataUnitSuite) TestIsMetadataFile_Directories() {
suffixes := append(append([]string{}, notMetaSuffixes...), metaSuffixes...)
for _, test := range cases {
for _, ext := range suffixes {
suite.Run(fmt.Sprintf("%s %s %s", test.service, test.category, ext), func() {
t := suite.T()
p, err := path.Builder{}.
Append("file"+ext).
ToDataLayerPath(
tenant,
user,
test.service,
test.category,
false,
)
require.NoError(t, err)
assert.Falsef(t, metadata.IsMetadataFile(p), "extension %s", ext)
})
}
}
}

View File

@@ -5,6 +5,7 @@ import (
"context"
"io"
"net/http"
"strings"
"sync"
"sync/atomic"
"time"
@@ -39,11 +40,17 @@ const (
DataFileSuffix = ".data"
)
func IsMetaFile(name string) bool {
return strings.HasSuffix(name, MetaFileSuffix) || strings.HasSuffix(name, DirMetaFileSuffix)
}
var (
_ data.BackupCollection = &Collection{}
_ data.Stream = &Item{}
_ data.StreamInfo = &Item{}
_ data.StreamModTime = &Item{}
_ data.Stream = &metadataItem{}
_ data.StreamModTime = &metadataItem{}
)
// Collection represents a set of OneDrive objects retrieved from M365
@@ -213,9 +220,6 @@ type Item struct {
id string
data io.ReadCloser
info details.ItemInfo
// true if the item was marked by graph as deleted.
deleted bool
}
func (od *Item) UUID() string {
@@ -226,9 +230,11 @@ func (od *Item) ToReader() io.ReadCloser {
return od.data
}
// TODO(ashmrtn): Fill in once delta tokens return deleted items.
// Deleted implements an interface function. However, OneDrive items are marked
// as deleted by adding them to the exclude list so this can always return
// false.
func (od Item) Deleted() bool {
return od.deleted
return false
}
func (od *Item) Info() details.ItemInfo {
@@ -239,6 +245,31 @@ func (od *Item) ModTime() time.Time {
return od.info.Modified()
}
type metadataItem struct {
id string
data io.ReadCloser
modTime time.Time
}
func (od *metadataItem) UUID() string {
return od.id
}
func (od *metadataItem) ToReader() io.ReadCloser {
return od.data
}
// Deleted implements an interface function. However, OneDrive items are marked
// as deleted by adding them to the exclude list so this can always return
// false.
func (od metadataItem) Deleted() bool {
return false
}
func (od *metadataItem) ModTime() time.Time {
return od.modTime
}
// populateItems iterates through items added to the collection
// and uses the collection `itemReader` to read the item
func (oc *Collection) populateItems(ctx context.Context, errs *fault.Bus) {
@@ -412,28 +443,10 @@ func (oc *Collection) populateItems(ctx context.Context, errs *fault.Bus) {
return progReader, nil
})
// TODO(meain): Remove this once we change to always
// backing up permissions. Until then we cannot rely
// on whether the previous data is what we need as the
// user might have not backup up permissions in the
// previous run.
metaItemInfo := details.ItemInfo{}
metaItemInfo.OneDrive = &details.OneDriveInfo{
Created: itemInfo.OneDrive.Created,
ItemName: itemInfo.OneDrive.ItemName,
DriveName: itemInfo.OneDrive.DriveName,
ItemType: itemInfo.OneDrive.ItemType,
IsMeta: true,
Modified: time.Now(), // set to current time to always refresh
Owner: itemInfo.OneDrive.Owner,
ParentPath: itemInfo.OneDrive.ParentPath,
Size: itemInfo.OneDrive.Size,
}
oc.data <- &Item{
id: itemName + metaSuffix,
data: metaReader,
info: metaItemInfo,
oc.data <- &metadataItem{
id: itemName + metaSuffix,
data: metaReader,
modTime: time.Now(),
}
}

View File

@@ -22,6 +22,7 @@ import (
"github.com/kopia/kopia/snapshot/snapshotfs"
"github.com/pkg/errors"
"github.com/alcionai/corso/src/internal/connector/graph/metadata"
"github.com/alcionai/corso/src/internal/data"
D "github.com/alcionai/corso/src/internal/diagnostics"
"github.com/alcionai/corso/src/pkg/backup/details"
@@ -448,16 +449,25 @@ func streamBaseEntries(
return errors.Wrap(err, "getting previous full item path for base entry")
}
// All items have item info in the base backup. However, we need to make
// sure we have enough metadata to find those entries. To do that we add the
// item to progress and having progress aggregate everything for later.
d := &itemDetails{
info: nil,
repoPath: itemPath,
prevPath: prevItemPath,
locationPath: locationPath,
// Meta files aren't in backup details since it's the set of items the user
// sees.
//
// TODO(ashmrtn): We may eventually want to make this a function that is
// passed in so that we can more easily switch it between different external
// service provider implementations.
if !metadata.IsMetadataFile(itemPath) {
// All items have item info in the base backup. However, we need to make
// sure we have enough metadata to find those entries. To do that we add
// the item to progress and have progress aggregate everything for later.
d := &itemDetails{
info: nil,
repoPath: itemPath,
prevPath: prevItemPath,
locationPath: locationPath,
}
progress.put(encodeAsPath(itemPath.PopFront().Elements()...), d)
}
progress.put(encodeAsPath(itemPath.PopFront().Elements()...), d)
if err := cb(ctx, entry); err != nil {
return errors.Wrapf(err, "executing callback on item %q", itemPath)

View File

@@ -17,6 +17,7 @@ import (
"golang.org/x/exp/maps"
"github.com/alcionai/corso/src/internal/connector/mockconnector"
"github.com/alcionai/corso/src/internal/connector/onedrive"
"github.com/alcionai/corso/src/internal/data"
"github.com/alcionai/corso/src/internal/tester"
"github.com/alcionai/corso/src/pkg/fault"
@@ -324,6 +325,166 @@ func (suite *KopiaIntegrationSuite) TestBackupCollections() {
}
}
func (suite *KopiaIntegrationSuite) TestBackupCollections_NoDetailsForMeta() {
tmp, err := path.Builder{}.Append(testInboxDir).ToDataLayerPath(
testTenant,
testUser,
path.OneDriveService,
path.FilesCategory,
false)
require.NoError(suite.T(), err)
storePath := tmp
locPath := tmp
// tags that are supplied by the caller. This includes basic tags to support
// lookups and extra tags the caller may want to apply.
tags := map[string]string{
"fnords": "smarf",
"brunhilda": "",
}
reasons := []Reason{
{
ResourceOwner: storePath.ResourceOwner(),
Service: storePath.Service(),
Category: storePath.Category(),
},
}
for _, r := range reasons {
for _, k := range r.TagKeys() {
tags[k] = ""
}
}
expectedTags := map[string]string{}
maps.Copy(expectedTags, normalizeTagKVs(tags))
table := []struct {
name string
expectedUploadedFiles int
expectedCachedFiles int
numDeetsEntries int
hasMetaDeets bool
cols func() []data.BackupCollection
}{
{
name: "Uncached",
expectedUploadedFiles: 3,
expectedCachedFiles: 0,
// MockStream implements item info even though OneDrive doesn't.
numDeetsEntries: 3,
hasMetaDeets: true,
cols: func() []data.BackupCollection {
mc := mockconnector.NewMockExchangeCollection(
storePath,
locPath,
3)
mc.Names[0] = testFileName
mc.Names[1] = testFileName + onedrive.MetaFileSuffix
mc.Names[2] = storePath.Folders()[0] + onedrive.DirMetaFileSuffix
return []data.BackupCollection{mc}
},
},
{
name: "Cached",
expectedUploadedFiles: 1,
expectedCachedFiles: 2,
// Meta entries are filtered out.
numDeetsEntries: 1,
hasMetaDeets: false,
cols: func() []data.BackupCollection {
mc := mockconnector.NewMockExchangeCollection(
storePath,
locPath,
1)
mc.Names[0] = testFileName
mc.ColState = data.NotMovedState
return []data.BackupCollection{mc}
},
},
}
prevSnaps := []IncrementalBase{}
for _, test := range table {
suite.Run(test.name, func() {
t := suite.T()
collections := test.cols()
stats, deets, prevShortRefs, err := suite.w.BackupCollections(
suite.ctx,
prevSnaps,
collections,
nil,
tags,
true,
fault.New(true))
assert.NoError(t, err)
assert.Equal(t, test.expectedUploadedFiles, stats.TotalFileCount, "total files")
assert.Equal(t, test.expectedUploadedFiles, stats.UncachedFileCount, "uncached files")
assert.Equal(t, test.expectedCachedFiles, stats.CachedFileCount, "cached files")
assert.Equal(t, 5, stats.TotalDirectoryCount)
assert.Equal(t, 0, stats.IgnoredErrorCount)
assert.Equal(t, 0, stats.ErrorCount)
assert.False(t, stats.Incomplete)
// numDeetsEntries file entries and 5 folder entries.
details := deets.Details().Entries
assert.Len(
t,
details,
test.numDeetsEntries+5,
)
for _, entry := range details {
assert.True(t, entry.Updated)
if test.hasMetaDeets {
continue
}
assert.False(t, onedrive.IsMetaFile(entry.RepoRef), "metadata entry in details")
}
assert.Len(t, prevShortRefs, 0)
for _, prevRef := range prevShortRefs {
assert.False(
t,
onedrive.IsMetaFile(prevRef.Repo.String()),
"metadata entry in base details")
}
checkSnapshotTags(
t,
suite.ctx,
suite.w.c,
expectedTags,
stats.SnapshotID,
)
snap, err := snapshot.LoadSnapshot(
suite.ctx,
suite.w.c,
manifest.ID(stats.SnapshotID),
)
require.NoError(t, err)
prevSnaps = append(prevSnaps, IncrementalBase{
Manifest: snap,
SubtreePaths: []*path.Builder{
storePath.ToBuilder().Dir(),
},
})
})
}
}
func (suite *KopiaIntegrationSuite) TestRestoreAfterCompressionChange() {
t := suite.T()
ctx, flush := tester.NewContext()

View File

@@ -2,7 +2,6 @@ package repository
import (
"context"
"strings"
"time"
"github.com/alcionai/clues"
@@ -367,10 +366,7 @@ func (r repository) BackupDetails(
if b.Version >= version.OneDrive1DataAndMetaFiles && b.Version < version.OneDrive3IsMetaMarker {
for _, d := range deets.Entries {
if d.OneDrive != nil {
if strings.HasSuffix(d.RepoRef, onedrive.MetaFileSuffix) ||
strings.HasSuffix(d.RepoRef, onedrive.DirMetaFileSuffix) {
d.OneDrive.IsMeta = true
}
d.OneDrive.IsMeta = onedrive.IsMetaFile(d.RepoRef)
}
}
}