Deserialize OneDrive metadata during backup (#2263)

## Description

Create helper functions to deserialize OneDrive metadata during subsequent backups. Currently the deserialized data is not passed to the function that generates Collections, and no metadata is passed in yet even though it is wired through GraphConnector.

Additional changes to BackupOp and operations/manifests.go are required to begin passing in metadata.
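As a rough sketch of the invariant the new helpers enforce (the drive IDs, URLs, and paths below are illustrative values, not the PR's data): `deserializeMetadata` reduces the previous backup's metadata collections to two maps keyed by drive ID, and a drive only takes the incremental path if both maps contain an entry for it. Partial entries are pruned:

```go
package main

import "fmt"

func main() {
	// Drive ID -> delta URL recovered from the previous backup.
	prevDeltas := map[string]string{
		"drive-1": "https://graph.microsoft.com/.../delta?token=abc",
		"drive-2": "https://graph.microsoft.com/.../delta?token=def",
	}

	// Drive ID -> (folder ID -> folder path); drive-2 has no entry here.
	prevFolders := map[string]map[string]string{
		"drive-1": {"folder-1": "previous/folder/path"},
	}

	// Drop partial results: a delta URL without folder paths (or vice versa)
	// can't safely source items from the previous base, so remove it.
	for id := range prevDeltas {
		if _, ok := prevFolders[id]; !ok {
			delete(prevDeltas, id)
		}
	}

	for id := range prevFolders {
		if _, ok := prevDeltas[id]; !ok {
			delete(prevFolders, id)
		}
	}

	fmt.Println(prevDeltas)  // only drive-1 survives
	fmt.Println(prevFolders) // only drive-1 survives
}
```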

## Does this PR need a docs update or release note?

- [ ] Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [x] No

## Type of change

- [x] 🌻 Feature
- [ ] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Test
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

## Issue(s)

* closes #2122

## Test Plan

- [x] 💪 Manual
- [ ] Unit test
- [ ] 💚 E2E

Commit 387f8e8cd7 (parent 070b8fddee), authored by ashmrtn on 2023-01-31 14:48:30 -08:00, committed via GitHub.
5 changed files with 444 additions and 9 deletions


@@ -83,7 +83,7 @@ func (gc *GraphConnector) DataCollections(
return colls, excludes, nil
case selectors.ServiceOneDrive:
- return gc.OneDriveDataCollections(ctx, sels, ctrlOpts)
+ return gc.OneDriveDataCollections(ctx, sels, metadata, ctrlOpts)
case selectors.ServiceSharePoint:
colls, excludes, err := sharepoint.DataCollections(
@@ -182,6 +182,7 @@ func (fm odFolderMatcher) Matches(dir string) bool {
func (gc *GraphConnector) OneDriveDataCollections(
ctx context.Context,
selector selectors.Selector,
metadata []data.Collection,
ctrlOpts control.Options,
) ([]data.Collection, map[string]struct{}, error) {
odb, err := selector.ToOneDriveBackup()
@@ -209,7 +210,7 @@ func (gc *GraphConnector) OneDriveDataCollections(
gc.Service,
gc.UpdateStatus,
ctrlOpts,
- ).Get(ctx)
+ ).Get(ctx, metadata)
if err != nil {
return nil, nil, support.WrapAndAppend(user, err, errs)
}


@@ -2,7 +2,9 @@ package onedrive
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
@@ -92,9 +94,145 @@ func NewCollections(
}
}
func deserializeMetadata(
ctx context.Context,
cols []data.Collection,
) (map[string]string, map[string]map[string]string, error) {
logger.Ctx(ctx).Infow(
"deserialzing previous backup metadata",
"num_collections",
len(cols),
)
prevDeltas := map[string]string{}
prevFolders := map[string]map[string]string{}
for _, col := range cols {
items := col.Items()
for breakLoop := false; !breakLoop; {
select {
case <-ctx.Done():
return nil, nil, errors.Wrap(ctx.Err(), "deserialzing previous backup metadata")
case item, ok := <-items:
if !ok {
// End of collection items.
breakLoop = true
break
}
var err error
switch item.UUID() {
case graph.PreviousPathFileName:
err = deserializeMap(item.ToReader(), prevFolders)
case graph.DeltaURLsFileName:
err = deserializeMap(item.ToReader(), prevDeltas)
default:
logger.Ctx(ctx).Infow(
"skipping unknown metadata file",
"file_name",
item.UUID(),
)
continue
}
if err == nil {
// Successful decode.
continue
}
// This is conservative, but report an error if any of the items for
// any of the deserialized maps have duplicate drive IDs. This will
// cause the entire backup to fail, but it's not clear if higher
// layers would have caught this. Worst case if we don't handle this
// we end up in a situation where we're sourcing items from the wrong
// base in kopia wrapper.
if errors.Is(err, errExistingMapping) {
return nil, nil, errors.Wrapf(
err,
"deserializing metadata file %s",
item.UUID(),
)
}
logger.Ctx(ctx).Errorw(
"deserializing base backup metadata. Falling back to full backup for selected drives",
"error",
err,
"file_name",
item.UUID(),
)
}
}
}
// Go through and remove partial results (i.e. path mapping but no delta URL
// or vice-versa).
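// For example, a delta URL for a drive that has no matching folder paths is
// dropped here, which sends that drive through a full (non-delta) enumeration.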
for k := range prevDeltas {
if _, ok := prevFolders[k]; !ok {
delete(prevDeltas, k)
}
}
for k := range prevFolders {
if _, ok := prevDeltas[k]; !ok {
delete(prevFolders, k)
}
}
return prevDeltas, prevFolders, nil
}
var errExistingMapping = errors.New("mapping already exists for same drive ID")
// deserializeMap takes a reader and a map of already deserialized items and
// adds the newly deserialized items to alreadyFound. Items are only added to
// alreadyFound if none of the keys in the freshly deserialized map already
// exist in alreadyFound. reader is closed at the end of this function.
func deserializeMap[T any](reader io.ReadCloser, alreadyFound map[string]T) error {
defer reader.Close()
tmp := map[string]T{}
err := json.NewDecoder(reader).Decode(&tmp)
if err != nil {
return errors.Wrap(err, "deserializing file contents")
}
var duplicate bool
for k := range tmp {
if _, ok := alreadyFound[k]; ok {
duplicate = true
break
}
}
if duplicate {
return errors.WithStack(errExistingMapping)
}
maps.Copy(alreadyFound, tmp)
return nil
}
// Retrieves drive data as a set of `data.Collections` and a set of item names to
// be excluded from the upcoming backup.
- func (c *Collections) Get(ctx context.Context) ([]data.Collection, map[string]struct{}, error) {
+ func (c *Collections) Get(
+ ctx context.Context,
+ prevMetadata []data.Collection,
+ ) ([]data.Collection, map[string]struct{}, error) {
_, _, err := deserializeMetadata(ctx, prevMetadata)
if err != nil {
return nil, nil, err
}
// Enumerate drives for the specified resourceOwner
pager, err := PagerForSource(c.source, c.service, c.resourceOwner, nil)
if err != nil {
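
Each metadata file holds a single JSON object, and `deserializeMap` refuses to merge a file whose drive IDs were already seen, surfacing `errExistingMapping` so the caller can fail the backup via `errors.Is`. Below is a self-contained sketch of that decode-then-merge pattern; `mergeJSON` and the sample payloads are illustrative stand-ins, not the PR's code, and it needs the `golang.org/x/exp` module that this file already pulls in for `maps`:

```go
package main

import (
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"strings"

	"golang.org/x/exp/maps"
)

var errExistingMapping = errors.New("mapping already exists for same drive ID")

// mergeJSON mirrors the deserializeMap pattern above: decode one JSON map,
// then merge it into found only if none of its keys are already present.
func mergeJSON[T any](r io.Reader, found map[string]T) error {
	tmp := map[string]T{}
	if err := json.NewDecoder(r).Decode(&tmp); err != nil {
		return err
	}

	for k := range tmp {
		if _, ok := found[k]; ok {
			return errExistingMapping
		}
	}

	maps.Copy(found, tmp)

	return nil
}

func main() {
	deltas := map[string]string{}

	// Metadata files for two different drives merge cleanly.
	fmt.Println(mergeJSON(strings.NewReader(`{"drive-1": "url/1"}`), deltas)) // <nil>
	fmt.Println(mergeJSON(strings.NewReader(`{"drive-2": "url/2"}`), deltas)) // <nil>

	// A second file mentioning drive-1 is rejected and merges nothing.
	err := mergeJSON(strings.NewReader(`{"drive-1": "url/3"}`), deltas)
	fmt.Println(errors.Is(err, errExistingMapping), deltas["drive-1"]) // true url/1
}
```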


@@ -11,8 +11,11 @@ import (
"golang.org/x/exp/maps"
"github.com/alcionai/corso/src/internal/connector/graph"
"github.com/alcionai/corso/src/internal/connector/support"
"github.com/alcionai/corso/src/internal/data"
"github.com/alcionai/corso/src/internal/tester"
"github.com/alcionai/corso/src/pkg/control"
"github.com/alcionai/corso/src/pkg/path"
"github.com/alcionai/corso/src/pkg/selectors"
)
@@ -621,13 +624,304 @@ func (suite *OneDriveCollectionsSuite) TestUpdateCollections() {
}
}
- func driveItem(id string, name string, path string, isFile, isFolder, isPackage bool) models.DriveItemable {
func (suite *OneDriveCollectionsSuite) TestDeserializeMetadata() {
tenant := "a-tenant"
user := "a-user"
driveID1 := "1"
driveID2 := "2"
deltaURL1 := "url/1"
deltaURL2 := "url/2"
folderID1 := "folder1"
folderID2 := "folder2"
path1 := "folder1/path"
path2 := "folder2/path"
table := []struct {
name string
// Each function returns the set of files for a single data.Collection.
cols []func() []graph.MetadataCollectionEntry
expectedDeltas map[string]string
expectedPaths map[string]map[string]string
errCheck assert.ErrorAssertionFunc
}{
{
name: "SuccessOneDriveAllOneCollection",
cols: []func() []graph.MetadataCollectionEntry{
func() []graph.MetadataCollectionEntry {
return []graph.MetadataCollectionEntry{
graph.NewMetadataEntry(
graph.DeltaURLsFileName,
map[string]string{driveID1: deltaURL1},
),
graph.NewMetadataEntry(
graph.PreviousPathFileName,
map[string]map[string]string{
driveID1: {
folderID1: path1,
},
},
),
}
},
},
expectedDeltas: map[string]string{
driveID1: deltaURL1,
},
expectedPaths: map[string]map[string]string{
driveID1: {
folderID1: path1,
},
},
errCheck: assert.NoError,
},
{
name: "MissingPaths",
cols: []func() []graph.MetadataCollectionEntry{
func() []graph.MetadataCollectionEntry {
return []graph.MetadataCollectionEntry{
graph.NewMetadataEntry(
graph.DeltaURLsFileName,
map[string]string{driveID1: deltaURL1},
),
}
},
},
expectedDeltas: map[string]string{},
expectedPaths: map[string]map[string]string{},
errCheck: assert.NoError,
},
{
name: "MissingDeltas",
cols: []func() []graph.MetadataCollectionEntry{
func() []graph.MetadataCollectionEntry {
return []graph.MetadataCollectionEntry{
graph.NewMetadataEntry(
graph.PreviousPathFileName,
map[string]map[string]string{
driveID1: {
folderID1: path1,
},
},
),
}
},
},
expectedDeltas: map[string]string{},
expectedPaths: map[string]map[string]string{},
errCheck: assert.NoError,
},
{
name: "SuccessTwoDrivesTwoCollections",
cols: []func() []graph.MetadataCollectionEntry{
func() []graph.MetadataCollectionEntry {
return []graph.MetadataCollectionEntry{
graph.NewMetadataEntry(
graph.DeltaURLsFileName,
map[string]string{driveID1: deltaURL1},
),
graph.NewMetadataEntry(
graph.PreviousPathFileName,
map[string]map[string]string{
driveID1: {
folderID1: path1,
},
},
),
}
},
func() []graph.MetadataCollectionEntry {
return []graph.MetadataCollectionEntry{
graph.NewMetadataEntry(
graph.DeltaURLsFileName,
map[string]string{driveID2: deltaURL2},
),
graph.NewMetadataEntry(
graph.PreviousPathFileName,
map[string]map[string]string{
driveID2: {
folderID2: path2,
},
},
),
}
},
},
expectedDeltas: map[string]string{
driveID1: deltaURL1,
driveID2: deltaURL2,
},
expectedPaths: map[string]map[string]string{
driveID1: {
folderID1: path1,
},
driveID2: {
folderID2: path2,
},
},
errCheck: assert.NoError,
},
{
// Bad formats are logged but skip adding entries to the maps and don't
// return an error.
name: "BadFormat",
cols: []func() []graph.MetadataCollectionEntry{
func() []graph.MetadataCollectionEntry {
return []graph.MetadataCollectionEntry{
graph.NewMetadataEntry(
graph.PreviousPathFileName,
map[string]string{driveID1: deltaURL1},
),
}
},
},
expectedDeltas: map[string]string{},
expectedPaths: map[string]map[string]string{},
errCheck: assert.NoError,
},
{
// Unexpected files are logged and skipped. They don't cause an error to
// be returned.
name: "BadFileName",
cols: []func() []graph.MetadataCollectionEntry{
func() []graph.MetadataCollectionEntry {
return []graph.MetadataCollectionEntry{
graph.NewMetadataEntry(
graph.DeltaURLsFileName,
map[string]string{driveID1: deltaURL1},
),
graph.NewMetadataEntry(
graph.PreviousPathFileName,
map[string]map[string]string{
driveID1: {
folderID1: path1,
},
},
),
graph.NewMetadataEntry(
"foo",
map[string]string{driveID1: deltaURL1},
),
}
},
},
expectedDeltas: map[string]string{
driveID1: deltaURL1,
},
expectedPaths: map[string]map[string]string{
driveID1: {
folderID1: path1,
},
},
errCheck: assert.NoError,
},
{
name: "DriveAlreadyFound_Paths",
cols: []func() []graph.MetadataCollectionEntry{
func() []graph.MetadataCollectionEntry {
return []graph.MetadataCollectionEntry{
graph.NewMetadataEntry(
graph.DeltaURLsFileName,
map[string]string{driveID1: deltaURL1},
),
graph.NewMetadataEntry(
graph.PreviousPathFileName,
map[string]map[string]string{
driveID1: {
folderID1: path1,
},
},
),
}
},
func() []graph.MetadataCollectionEntry {
return []graph.MetadataCollectionEntry{
graph.NewMetadataEntry(
graph.PreviousPathFileName,
map[string]map[string]string{
driveID1: {
folderID2: path2,
},
},
),
}
},
},
expectedDeltas: nil,
expectedPaths: nil,
errCheck: assert.Error,
},
{
name: "DriveAlreadyFound_Deltas",
cols: []func() []graph.MetadataCollectionEntry{
func() []graph.MetadataCollectionEntry {
return []graph.MetadataCollectionEntry{
graph.NewMetadataEntry(
graph.DeltaURLsFileName,
map[string]string{driveID1: deltaURL1},
),
graph.NewMetadataEntry(
graph.PreviousPathFileName,
map[string]map[string]string{
driveID1: {
folderID1: path1,
},
},
),
}
},
func() []graph.MetadataCollectionEntry {
return []graph.MetadataCollectionEntry{
graph.NewMetadataEntry(
graph.DeltaURLsFileName,
map[string]string{driveID1: deltaURL2},
),
}
},
},
expectedDeltas: nil,
expectedPaths: nil,
errCheck: assert.Error,
},
}
for _, test := range table {
suite.T().Run(test.name, func(t *testing.T) {
ctx, flush := tester.NewContext()
defer flush()
cols := []data.Collection{}
for _, c := range test.cols {
mc, err := graph.MakeMetadataCollection(
tenant,
user,
path.OneDriveService,
path.FilesCategory,
c(),
func(*support.ConnectorOperationStatus) {},
)
require.NoError(t, err)
cols = append(cols, mc)
}
deltas, paths, err := deserializeMetadata(ctx, cols)
test.errCheck(t, err)
assert.Equal(t, test.expectedDeltas, deltas)
assert.Equal(t, test.expectedPaths, paths)
})
}
}
+ func driveItem(id string, name string, parentPath string, isFile, isFolder, isPackage bool) models.DriveItemable {
item := models.NewDriveItem()
item.SetName(&name)
item.SetId(&id)
parentReference := models.NewItemReference()
- parentReference.SetPath(&path)
+ parentReference.SetPath(&parentPath)
item.SetParentReference(parentReference)
switch {
@@ -644,13 +938,13 @@ func driveItem(id string, name string, path string, isFile, isFolder, isPackage
// delItem creates a DriveItemable that is marked as deleted. path must be set
// to the base drive path.
- func delItem(id string, path string, isFile, isFolder, isPackage bool) models.DriveItemable {
+ func delItem(id string, parentPath string, isFile, isFolder, isPackage bool) models.DriveItemable {
item := models.NewDriveItem()
item.SetId(&id)
item.SetDeleted(models.NewDeleted())
parentReference := models.NewItemReference()
- parentReference.SetPath(&path)
+ parentReference.SetPath(&parentPath)
item.SetParentReference(parentReference)
switch {


@@ -463,7 +463,7 @@ func (suite *OneDriveSuite) TestOneDriveNewCollections() {
service,
service.updateStatus,
control.Options{},
- ).Get(ctx)
+ ).Get(ctx, nil)
assert.NoError(t, err)
// Don't expect excludes as this isn't an incremental backup.
assert.Empty(t, excludes)


@@ -152,7 +152,9 @@ func collectLibraries(
updater.UpdateStatus,
ctrlOpts)
- odcs, excludes, err := colls.Get(ctx)
+ // TODO(ashmrtn): Pass previous backup metadata when SharePoint supports delta
+ // token-based incrementals.
+ odcs, excludes, err := colls.Get(ctx, nil)
if err != nil {
return nil, nil, support.WrapAndAppend(siteID, err, errs)
}