ashmrtn 387f8e8cd7
Deserialize OneDrive metadata during backup (#2263)
## Description

Create helper functions to deserialize OneDrive metadata during subsequent backups. Currently the deserialized data is not passed to the function that generates Collections, nor is the metadata passed in, even though it is wired through GraphConnector.

Additional changes to BackupOp and operations/manifests.go are required to begin passing in the metadata.
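
For illustration only, here is a minimal sketch of how the new `deserializeMap` helper behaves, written as if it lived in a test inside the same `onedrive` package. The drive ID and delta URL are made up, and the usual imports (`io`, `strings`, `testing`, and the errors package already used by this file) are assumed:

```go
func TestDeserializeMapSketch(t *testing.T) {
	// Metadata files are JSON objects keyed by drive ID.
	payload := `{"drive-1": "https://graph.microsoft.com/.../delta?token=abc"}`
	prevDeltas := map[string]string{}

	rc := io.NopCloser(strings.NewReader(payload))
	if err := deserializeMap(rc, prevDeltas); err != nil {
		t.Fatal(err)
	}

	// A second file repeating the same drive ID is rejected, so delta state
	// for one drive is never sourced from two different metadata files.
	rc = io.NopCloser(strings.NewReader(payload))
	if err := deserializeMap(rc, prevDeltas); !errors.Is(err, errExistingMapping) {
		t.Fatalf("expected errExistingMapping, got %v", err)
	}
}
```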

## Does this PR need a docs update or release note?

- [ ]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [x]  No 

## Type of change

- [x] 🌻 Feature
- [ ] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Test
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

## Issue(s)

* closes #2122

## Test Plan

- [x] 💪 Manual
- [ ]  Unit test
- [ ] 💚 E2E
2023-01-31 22:48:30 +00:00


package onedrive
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"github.com/microsoftgraph/msgraph-sdk-go/models"
"github.com/pkg/errors"
"golang.org/x/exp/maps"
"github.com/alcionai/corso/src/internal/connector/graph"
"github.com/alcionai/corso/src/internal/connector/support"
"github.com/alcionai/corso/src/internal/data"
"github.com/alcionai/corso/src/internal/observe"
"github.com/alcionai/corso/src/pkg/control"
"github.com/alcionai/corso/src/pkg/logger"
"github.com/alcionai/corso/src/pkg/path"
)
type driveSource int
const (
unknownDriveSource driveSource = iota
OneDriveSource
SharePointSource
)
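// restrictedDirectory is the name of a SharePoint library drive (Site Pages)
// that is skipped during backup; see shouldSkipDrive.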
const restrictedDirectory = "Site Pages"
func (ds driveSource) toPathServiceCat() (path.ServiceType, path.CategoryType) {
switch ds {
case OneDriveSource:
return path.OneDriveService, path.FilesCategory
case SharePointSource:
return path.SharePointService, path.LibrariesCategory
default:
return path.UnknownService, path.UnknownCategory
}
}
type folderMatcher interface {
IsAny() bool
Matches(string) bool
}
// Collections is used to retrieve drive data for a
// resource owner, which can be either a user or a SharePoint site.
type Collections struct {
// configured to handle large item downloads
itemClient *http.Client
tenant string
resourceOwner string
source driveSource
matcher folderMatcher
service graph.Servicer
statusUpdater support.StatusUpdater
ctrl control.Options
// collectionMap allows lookup of the data.Collection
// for a OneDrive folder
CollectionMap map[string]data.Collection
// Track stats from drive enumeration. Represents the items backed up.
NumItems int
NumFiles int
NumContainers int
}
func NewCollections(
itemClient *http.Client,
tenant string,
resourceOwner string,
source driveSource,
matcher folderMatcher,
service graph.Servicer,
statusUpdater support.StatusUpdater,
ctrlOpts control.Options,
) *Collections {
return &Collections{
itemClient: itemClient,
tenant: tenant,
resourceOwner: resourceOwner,
source: source,
matcher: matcher,
CollectionMap: map[string]data.Collection{},
service: service,
statusUpdater: statusUpdater,
ctrl: ctrlOpts,
}
}
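// deserializeMetadata reads the metadata collections persisted by the
// previous backup and returns the delta URLs (drive ID -> delta URL) and the
// folder paths (drive ID -> folder ID -> folder path) that they contain.
// Drives with only partial metadata (a delta URL without folder paths, or
// vice-versa) are dropped so they fall back to a full enumeration.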
func deserializeMetadata(
ctx context.Context,
cols []data.Collection,
) (map[string]string, map[string]map[string]string, error) {
logger.Ctx(ctx).Infow(
"deserialzing previous backup metadata",
"num_collections",
len(cols),
)
prevDeltas := map[string]string{}
prevFolders := map[string]map[string]string{}
for _, col := range cols {
items := col.Items()
for breakLoop := false; !breakLoop; {
select {
case <-ctx.Done():
return nil, nil, errors.Wrap(ctx.Err(), "deserializing previous backup metadata")
case item, ok := <-items:
if !ok {
// End of collection items.
breakLoop = true
break
}
var err error
switch item.UUID() {
case graph.PreviousPathFileName:
err = deserializeMap(item.ToReader(), prevFolders)
case graph.DeltaURLsFileName:
err = deserializeMap(item.ToReader(), prevDeltas)
default:
logger.Ctx(ctx).Infow(
"skipping unknown metadata file",
"file_name",
item.UUID(),
)
continue
}
if err == nil {
// Successful decode.
continue
}
// This is conservative, but report an error if any of the items for
// any of the deserialized maps have duplicate drive IDs. This will
// cause the entire backup to fail, but it's not clear if higher
// layers would have caught this. Worst case if we don't handle this
// we end up in a situation where we're sourcing items from the wrong
// base in kopia wrapper.
if errors.Is(err, errExistingMapping) {
return nil, nil, errors.Wrapf(
err,
"deserializing metadata file %s",
item.UUID(),
)
}
logger.Ctx(ctx).Errorw(
"deserializing base backup metadata. Falling back to full backup for selected drives",
"error",
err,
"file_name",
item.UUID(),
)
}
}
// Go through and remove partial results (i.e. path mapping but no delta URL
// or vice-versa).
for k := range prevDeltas {
if _, ok := prevFolders[k]; !ok {
delete(prevDeltas, k)
}
}
for k := range prevFolders {
if _, ok := prevDeltas[k]; !ok {
delete(prevFolders, k)
}
}
}
return prevDeltas, prevFolders, nil
}
var errExistingMapping = errors.New("mapping already exists for same drive ID")
// deserializeMap takes a reader and a map of already deserialized items and
// adds the newly deserialized items to alreadyFound. Items are only added to
// alreadyFound if none of the keys in the freshly deserialized map already
// exist in alreadyFound. reader is closed at the end of this function.
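// For illustration, the delta-URL metadata file contains JSON shaped like
// {"<driveID>": "<delta URL>"} and the paths file contains
// {"<driveID>": {"<folderID>": "<folder path>"}}; the keys shown here are
// placeholders.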
func deserializeMap[T any](reader io.ReadCloser, alreadyFound map[string]T) error {
defer reader.Close()
tmp := map[string]T{}
err := json.NewDecoder(reader).Decode(&tmp)
if err != nil {
return errors.Wrap(err, "deserializing file contents")
}
var duplicate bool
for k := range tmp {
if _, ok := alreadyFound[k]; ok {
duplicate = true
break
}
}
if duplicate {
return errors.WithStack(errExistingMapping)
}
maps.Copy(alreadyFound, tmp)
return nil
}
// Get retrieves drive data as a set of `data.Collections` and a set of item
// names to be excluded from the upcoming backup.
func (c *Collections) Get(
ctx context.Context,
prevMetadata []data.Collection,
) ([]data.Collection, map[string]struct{}, error) {
_, _, err := deserializeMetadata(ctx, prevMetadata)
if err != nil {
return nil, nil, err
}
// Enumerate drives for the specified resourceOwner
pager, err := PagerForSource(c.source, c.service, c.resourceOwner, nil)
if err != nil {
return nil, nil, err
}
retry := c.source == OneDriveSource
drives, err := drives(ctx, pager, retry)
if err != nil {
return nil, nil, err
}
var (
// Drive ID -> delta URL for drive
deltaURLs = map[string]string{}
// Drive ID -> folder ID -> folder path
folderPaths = map[string]map[string]string{}
// Items that should be excluded when sourcing data from the base backup.
// TODO(ashmrtn): This list contains the M365 IDs of deleted items so while
// it's technically safe to pass all the way through to kopia (files are
// unlikely to be named their M365 ID) we should wait to do that until we've
// switched to using those IDs for file names in kopia.
excludedItems = map[string]struct{}{}
)
// Update the collection map with items from each drive
for _, d := range drives {
driveID := *d.GetId()
driveName := *d.GetName()
delta, paths, excluded, err := collectItems(
ctx,
c.service,
driveID,
driveName,
c.UpdateCollections,
)
if err != nil {
return nil, nil, err
}
if len(delta) > 0 {
deltaURLs[driveID] = delta
}
if len(paths) > 0 {
folderPaths[driveID] = map[string]string{}
for id, p := range paths {
folderPaths[driveID][id] = p
}
}
maps.Copy(excludedItems, excluded)
}
observe.Message(ctx, observe.Safe(fmt.Sprintf("Discovered %d items to back up", c.NumItems)))
// Add an extra for the metadata collection.
collections := make([]data.Collection, 0, len(c.CollectionMap)+1)
for _, coll := range c.CollectionMap {
collections = append(collections, coll)
}
service, category := c.source.toPathServiceCat()
metadata, err := graph.MakeMetadataCollection(
c.tenant,
c.resourceOwner,
service,
category,
[]graph.MetadataCollectionEntry{
graph.NewMetadataEntry(graph.PreviousPathFileName, folderPaths),
graph.NewMetadataEntry(graph.DeltaURLsFileName, deltaURLs),
},
c.statusUpdater,
)
if err != nil {
// Technically it's safe to continue here because the logic for starting an
// incremental backup should eventually find that the metadata files are
// empty/missing and default to a full backup.
logger.Ctx(ctx).Warnw(
"making metadata collection for future incremental backups",
"error",
err,
)
} else {
collections = append(collections, metadata)
}
// TODO(ashmrtn): Track and return the set of items to exclude.
return collections, nil, nil
}
// UpdateCollections initializes and adds the provided drive items to Collections.
// A new collection is created for every drive folder (or package).
// oldPaths is the unchanged data that was loaded from the metadata file.
// newPaths starts as a copy of oldPaths and is updated as changes are found in
// the returned results.
func (c *Collections) UpdateCollections(
ctx context.Context,
driveID, driveName string,
items []models.DriveItemable,
oldPaths map[string]string,
newPaths map[string]string,
excluded map[string]struct{},
) error {
for _, item := range items {
if item.GetRoot() != nil {
// Skip the root item
continue
}
if item.GetParentReference() == nil || item.GetParentReference().GetPath() == nil {
return errors.Errorf("item does not have a parent reference. item name : %s", *item.GetName())
}
// Create a collection for the parent of this item
collectionPath, err := GetCanonicalPath(
*item.GetParentReference().GetPath(),
c.tenant,
c.resourceOwner,
c.source,
)
if err != nil {
return err
}
// Skip items that don't match the folder selectors we were given.
if shouldSkipDrive(ctx, collectionPath, c.matcher, driveName) {
logger.Ctx(ctx).Infof("Skipping path %s", collectionPath.String())
continue
}
switch {
case item.GetFolder() != nil, item.GetPackage() != nil:
if item.GetDeleted() != nil {
// Nested folders also return deleted delta results so we don't have to
// worry about doing a prefix search in the map to remove the subtree of
// the deleted folder/package.
delete(newPaths, *item.GetId())
// TODO(ashmrtn): Create a collection with state Deleted.
break
}
// Deletions of folders are handled in this case so we may as well start
// off by saving the path.Path of the item instead of just the OneDrive
// parentRef or such.
folderPath, err := collectionPath.Append(*item.GetName(), false)
if err != nil {
logger.Ctx(ctx).Errorw("failed building collection path", "error", err)
return err
}
// Moved folders don't cause delta results for any subfolders nested in
// them. We need to go through and update paths to handle that. We only
// update newPaths so we don't accidentally clobber previous deletes.
//
// TODO(ashmrtn): Since we're also getting notifications about folder
// moves we may need to handle updates to a path of a collection we've
// already created and partially populated.
updatePath(newPaths, *item.GetId(), folderPath.String())
case item.GetFile() != nil:
if item.GetDeleted() != nil {
excluded[*item.GetId()] = struct{}{}
// Exchange counts items streamed through it, which includes deletions, so
// add that here too.
c.NumFiles++
c.NumItems++
continue
}
// TODO(ashmrtn): Figure out what to do when an item was moved (maybe) and add
// it to the exclude list.
col, found := c.CollectionMap[collectionPath.String()]
if !found {
// TODO(ashmrtn): Compare old and new path and set collection state
// accordingly.
col = NewCollection(
c.itemClient,
collectionPath,
driveID,
c.service,
c.statusUpdater,
c.source,
c.ctrl)
c.CollectionMap[collectionPath.String()] = col
c.NumContainers++
c.NumItems++
}
collection := col.(*Collection)
collection.Add(item)
c.NumFiles++
c.NumItems++
default:
return errors.Errorf("item type not supported. item name : %s", *item.GetName())
}
}
return nil
}
func shouldSkipDrive(ctx context.Context, drivePath path.Path, m folderMatcher, driveName string) bool {
return !includePath(ctx, m, drivePath) ||
(drivePath.Category() == path.LibrariesCategory && restrictedDirectory == driveName)
}
// GetCanonicalPath constructs the standard path for the given source.
func GetCanonicalPath(p, tenant, resourceOwner string, source driveSource) (path.Path, error) {
var (
pathBuilder = path.Builder{}.Append(strings.Split(p, "/")...)
result path.Path
err error
)
switch source {
case OneDriveSource:
result, err = pathBuilder.ToDataLayerOneDrivePath(tenant, resourceOwner, false)
case SharePointSource:
result, err = pathBuilder.ToDataLayerSharePointPath(tenant, resourceOwner, path.LibrariesCategory, false)
default:
return nil, errors.Errorf("unrecognized drive data source")
}
if err != nil {
return nil, errors.Wrap(err, "converting to canonical path")
}
return result, nil
}
func includePath(ctx context.Context, m folderMatcher, folderPath path.Path) bool {
// Check if the folder is allowed by the scope.
folderPathString, err := path.GetDriveFolderPath(folderPath)
if err != nil {
logger.Ctx(ctx).Error(err)
return true
}
// Hack for the edge case where we're looking at the root folder and can
// select any folder. Right now the root folder has an empty folder path.
if len(folderPathString) == 0 && m.IsAny() {
return true
}
return m.Matches(folderPathString)
}
func updatePath(paths map[string]string, id, newPath string) {
oldPath := paths[id]
if len(oldPath) == 0 {
paths[id] = newPath
return
}
if oldPath == newPath {
return
}
// We need to do a prefix search on the rest of the map to update the subtree.
// We don't need to make collections for all of these, as hierarchy merging in
// other components should take care of that. We do need to ensure that the
// resulting map contains all folders though so we know the next time around.
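// For example (illustrative paths), when folder "a/b" is renamed to "a/c",
// entries for descendants such as "a/b/d" are rewritten to "a/c/d".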
for folderID, p := range paths {
if !strings.HasPrefix(p, oldPath) {
continue
}
paths[folderID] = strings.Replace(p, oldPath, newPath, 1)
}
}