package onedrive

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"

	"github.com/microsoftgraph/msgraph-sdk-go/models"
	"github.com/pkg/errors"
	"golang.org/x/exp/maps"

	"github.com/alcionai/corso/src/internal/connector/graph"
	"github.com/alcionai/corso/src/internal/connector/support"
	"github.com/alcionai/corso/src/internal/data"
	"github.com/alcionai/corso/src/internal/observe"
	"github.com/alcionai/corso/src/pkg/control"
	"github.com/alcionai/corso/src/pkg/logger"
	"github.com/alcionai/corso/src/pkg/path"
)

type driveSource int

const (
	unknownDriveSource driveSource = iota
	OneDriveSource
	SharePointSource
)

const restrictedDirectory = "Site Pages"

func (ds driveSource) toPathServiceCat() (path.ServiceType, path.CategoryType) {
	switch ds {
	case OneDriveSource:
		return path.OneDriveService, path.FilesCategory
	case SharePointSource:
		return path.SharePointService, path.LibrariesCategory
	default:
		return path.UnknownService, path.UnknownCategory
	}
}

type folderMatcher interface {
	IsAny() bool
	Matches(string) bool
}

// Collections is used to retrieve drive data for a
// resource owner, which can be either a user or a SharePoint site.
type Collections struct {
	// itemClient is configured to handle large item downloads.
	itemClient *http.Client

	tenant        string
	resourceOwner string
	source        driveSource
	matcher       folderMatcher
	service       graph.Servicer
	statusUpdater support.StatusUpdater
	ctrl          control.Options

	// CollectionMap allows lookup of the data.Collection
	// for a OneDrive folder.
	CollectionMap map[string]data.Collection

	// Track stats from drive enumeration. Represents the items backed up.
	NumItems      int
	NumFiles      int
	NumContainers int
}

func NewCollections(
	itemClient *http.Client,
	tenant string,
	resourceOwner string,
	source driveSource,
	matcher folderMatcher,
	service graph.Servicer,
	statusUpdater support.StatusUpdater,
	ctrlOpts control.Options,
) *Collections {
	return &Collections{
		itemClient:    itemClient,
		tenant:        tenant,
		resourceOwner: resourceOwner,
		source:        source,
		matcher:       matcher,
		CollectionMap: map[string]data.Collection{},
		service:       service,
		statusUpdater: statusUpdater,
		ctrl:          ctrlOpts,
	}
}

// deserializeMetadata reads the previous backup's metadata collections and
// returns the delta URLs and folder paths found in them, keyed by drive ID.
func deserializeMetadata(
	ctx context.Context,
	cols []data.Collection,
) (map[string]string, map[string]map[string]string, error) {
	logger.Ctx(ctx).Infow(
		"deserializing previous backup metadata",
		"num_collections", len(cols),
	)

	prevDeltas := map[string]string{}
	prevFolders := map[string]map[string]string{}

	for _, col := range cols {
		items := col.Items()

		for breakLoop := false; !breakLoop; {
			select {
			case <-ctx.Done():
				return nil, nil, errors.Wrap(ctx.Err(), "deserializing previous backup metadata")

			case item, ok := <-items:
				if !ok {
					// End of collection items.
					breakLoop = true
					break
				}

				var err error

				switch item.UUID() {
				case graph.PreviousPathFileName:
					err = deserializeMap(item.ToReader(), prevFolders)

				case graph.DeltaURLsFileName:
					err = deserializeMap(item.ToReader(), prevDeltas)

				default:
					logger.Ctx(ctx).Infow(
						"skipping unknown metadata file",
						"file_name", item.UUID(),
					)

					continue
				}

				if err == nil {
					// Successful decode.
					continue
				}

				// This is conservative, but report an error if any of the items for
				// any of the deserialized maps have duplicate drive IDs. This will
				// cause the entire backup to fail, but it's not clear if higher
				// layers would have caught this. Worst case, if we don't handle this
				// we end up in a situation where we're sourcing items from the wrong
				// base in the kopia wrapper.
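				//
				// For reference (illustrative only; the IDs below are hypothetical),
				// the two metadata files decode into maps shaped roughly like:
				//
				//	graph.DeltaURLsFileName:    {"<driveID>": "<delta URL>"}
				//	graph.PreviousPathFileName: {"<driveID>": {"<folderID>": "<folder path>"}}
				//
				// deserializeMap returns errExistingMapping when a drive ID in a
				// freshly decoded file is already present in the map it merges into.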
				if errors.Is(err, errExistingMapping) {
					return nil, nil, errors.Wrapf(
						err,
						"deserializing metadata file %s",
						item.UUID(),
					)
				}

				logger.Ctx(ctx).Errorw(
					"deserializing base backup metadata. Falling back to full backup for selected drives",
					"error", err,
					"file_name", item.UUID(),
				)
			}
		}

		// Go through and remove partial results (i.e. a path mapping but no delta
		// URL or vice-versa).
		for k := range prevDeltas {
			if _, ok := prevFolders[k]; !ok {
				delete(prevDeltas, k)
			}
		}

		for k := range prevFolders {
			if _, ok := prevDeltas[k]; !ok {
				delete(prevFolders, k)
			}
		}
	}

	return prevDeltas, prevFolders, nil
}

var errExistingMapping = errors.New("mapping already exists for same drive ID")

// deserializeMap takes a reader and a map of already deserialized items and
// adds the newly deserialized items to alreadyFound. Items are only added to
// alreadyFound if none of the keys in the freshly deserialized map already
// exist in alreadyFound. reader is closed at the end of this function.
func deserializeMap[T any](reader io.ReadCloser, alreadyFound map[string]T) error {
	defer reader.Close()

	tmp := map[string]T{}

	err := json.NewDecoder(reader).Decode(&tmp)
	if err != nil {
		return errors.Wrap(err, "deserializing file contents")
	}

	var duplicate bool

	for k := range tmp {
		if _, ok := alreadyFound[k]; ok {
			duplicate = true
			break
		}
	}

	if duplicate {
		return errors.WithStack(errExistingMapping)
	}

	maps.Copy(alreadyFound, tmp)

	return nil
}

// Get retrieves drive data as a set of `data.Collections` and a set of item
// names to be excluded from the upcoming backup.
func (c *Collections) Get(
	ctx context.Context,
	prevMetadata []data.Collection,
) ([]data.Collection, map[string]struct{}, error) {
	_, _, err := deserializeMetadata(ctx, prevMetadata)
	if err != nil {
		return nil, nil, err
	}

	// Enumerate drives for the specified resourceOwner.
	pager, err := PagerForSource(c.source, c.service, c.resourceOwner, nil)
	if err != nil {
		return nil, nil, err
	}

	retry := c.source == OneDriveSource

	drives, err := drives(ctx, pager, retry)
	if err != nil {
		return nil, nil, err
	}

	var (
		// Drive ID -> delta URL for drive.
		deltaURLs = map[string]string{}
		// Drive ID -> folder ID -> folder path.
		folderPaths = map[string]map[string]string{}
		// Items that should be excluded when sourcing data from the base backup.
		// TODO(ashmrtn): This list contains the M365 IDs of deleted items so while
		// it's technically safe to pass all the way through to kopia (files are
		// unlikely to be named their M365 ID) we should wait to do that until
		// we've switched to using those IDs for file names in kopia.
		excludedItems = map[string]struct{}{}
	)

	// Update the collection map with items from each drive.
	for _, d := range drives {
		driveID := *d.GetId()
		driveName := *d.GetName()

		delta, paths, excluded, err := collectItems(
			ctx,
			c.service,
			driveID,
			driveName,
			c.UpdateCollections,
		)
		if err != nil {
			return nil, nil, err
		}

		if len(delta) > 0 {
			deltaURLs[driveID] = delta
		}

		if len(paths) > 0 {
			folderPaths[driveID] = map[string]string{}

			for id, p := range paths {
				folderPaths[driveID][id] = p
			}
		}

		maps.Copy(excludedItems, excluded)
	}

	observe.Message(ctx, observe.Safe(fmt.Sprintf("Discovered %d items to backup", c.NumItems)))

	// Add an extra slot for the metadata collection.
	collections := make([]data.Collection, 0, len(c.CollectionMap)+1)
	for _, coll := range c.CollectionMap {
		collections = append(collections, coll)
	}

	service, category := c.source.toPathServiceCat()
	metadata, err := graph.MakeMetadataCollection(
		c.tenant,
		c.resourceOwner,
		service,
		category,
		[]graph.MetadataCollectionEntry{
			graph.NewMetadataEntry(graph.PreviousPathFileName, folderPaths),
			graph.NewMetadataEntry(graph.DeltaURLsFileName, deltaURLs),
		},
		c.statusUpdater,
	)

	if err != nil {
		// Technically it's safe to continue here because the logic for starting an
		// incremental backup should eventually find that the metadata files are
		// empty/missing and default to a full backup.
		logger.Ctx(ctx).Warnw(
			"making metadata collection for future incremental backups",
			"error", err,
		)
	} else {
		collections = append(collections, metadata)
	}

	// TODO(ashmrtn): Track and return the set of items to exclude.
	return collections, nil, nil
}
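// exampleCollectionsUsage is an illustrative sketch, not part of the original
// backup flow: it shows how a caller might wire up Collections and Get using
// the constructors above. The dependencies are taken as parameters, and the
// tenant and resource owner literals below are hypothetical placeholders.
func exampleCollectionsUsage(
	ctx context.Context,
	itemClient *http.Client,
	svc graph.Servicer,
	matcher folderMatcher,
	statusUpdater support.StatusUpdater,
	prevMetadata []data.Collection,
) ([]data.Collection, map[string]struct{}, error) {
	// Build a Collections for a single OneDrive user (hypothetical IDs).
	c := NewCollections(
		itemClient,
		"tenant-id",
		"user@example.com",
		OneDriveSource,
		matcher,
		svc,
		statusUpdater,
		control.Options{},
	)

	// Get enumerates the user's drives and returns one data.Collection per
	// drive folder, plus a metadata collection for future incremental backups.
	return c.Get(ctx, prevMetadata)
}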
// UpdateCollections initializes and adds the provided drive items to
// Collections. A new collection is created for every drive folder (or
// package).
// oldPaths is the unchanged data that was loaded from the metadata file.
// newPaths starts as a copy of oldPaths and is updated as changes are found in
// the returned results.
func (c *Collections) UpdateCollections(
	ctx context.Context,
	driveID, driveName string,
	items []models.DriveItemable,
	oldPaths map[string]string,
	newPaths map[string]string,
	excluded map[string]struct{},
) error {
	for _, item := range items {
		if item.GetRoot() != nil {
			// Skip the root item.
			continue
		}

		if item.GetParentReference() == nil || item.GetParentReference().GetPath() == nil {
			return errors.Errorf("item does not have a parent reference. item name : %s", *item.GetName())
		}

		// Create a collection for the parent of this item.
		collectionPath, err := GetCanonicalPath(
			*item.GetParentReference().GetPath(),
			c.tenant,
			c.resourceOwner,
			c.source,
		)
		if err != nil {
			return err
		}

		// Skip items that don't match the folder selectors we were given.
		if shouldSkipDrive(ctx, collectionPath, c.matcher, driveName) {
			logger.Ctx(ctx).Infof("Skipping path %s", collectionPath.String())
			continue
		}

		switch {
		case item.GetFolder() != nil, item.GetPackage() != nil:
			if item.GetDeleted() != nil {
				// Nested folders also return deleted delta results so we don't have to
				// worry about doing a prefix search in the map to remove the subtree
				// of the deleted folder/package.
				delete(newPaths, *item.GetId())

				// TODO(ashmrtn): Create a collection with state Deleted.
				break
			}

			// Deletions of folders are handled in this case, so we may as well start
			// off by saving the path.Path of the item instead of just the OneDrive
			// parentRef or such.
			folderPath, err := collectionPath.Append(*item.GetName(), false)
			if err != nil {
				logger.Ctx(ctx).Errorw("failed building collection path", "error", err)
				return err
			}

			// Moved folders don't cause delta results for any subfolders nested in
			// them. We need to go through and update paths to handle that. We only
			// update newPaths so we don't accidentally clobber previous deletes.
			//
			// TODO(ashmrtn): Since we're also getting notifications about folder
			// moves we may need to handle updates to a path of a collection we've
			// already created and partially populated.
			updatePath(newPaths, *item.GetId(), folderPath.String())

		case item.GetFile() != nil:
			if item.GetDeleted() != nil {
				excluded[*item.GetId()] = struct{}{}
				// Exchange counts items streamed through it, which includes deletions,
				// so add that here too.
				c.NumFiles++
				c.NumItems++

				continue
			}

			// TODO(ashmrtn): Figure out what to do when an item was moved (maybe)
			// and add it to the exclude list.
			col, found := c.CollectionMap[collectionPath.String()]
			if !found {
				// TODO(ashmrtn): Compare old and new path and set collection state
				// accordingly.
				col = NewCollection(
					c.itemClient,
					collectionPath,
					driveID,
					c.service,
					c.statusUpdater,
					c.source,
					c.ctrl)
				c.CollectionMap[collectionPath.String()] = col
				c.NumContainers++
				c.NumItems++
			}

			collection := col.(*Collection)
			collection.Add(item)
			c.NumFiles++
			c.NumItems++

		default:
			return errors.Errorf("item type not supported. item name : %s", *item.GetName())
		}
	}

	return nil
}

func shouldSkipDrive(ctx context.Context, drivePath path.Path, m folderMatcher, driveName string) bool {
	return !includePath(ctx, m, drivePath) ||
		(drivePath.Category() == path.LibrariesCategory && restrictedDirectory == driveName)
}

// GetCanonicalPath constructs the standard path for the given source.
func GetCanonicalPath(p, tenant, resourceOwner string, source driveSource) (path.Path, error) {
	var (
		pathBuilder = path.Builder{}.Append(strings.Split(p, "/")...)
		result      path.Path
		err         error
	)

	switch source {
	case OneDriveSource:
		result, err = pathBuilder.ToDataLayerOneDrivePath(tenant, resourceOwner, false)
	case SharePointSource:
		result, err = pathBuilder.ToDataLayerSharePointPath(tenant, resourceOwner, path.LibrariesCategory, false)
	default:
		return nil, errors.Errorf("unrecognized drive data source")
	}

	if err != nil {
		return nil, errors.Wrap(err, "converting to canonical path")
	}

	return result, nil
}

func includePath(ctx context.Context, m folderMatcher, folderPath path.Path) bool {
	// Check if the folder is allowed by the scope.
	folderPathString, err := path.GetDriveFolderPath(folderPath)
	if err != nil {
		logger.Ctx(ctx).Error(err)
		return true
	}

	// Hack for the edge case where we're looking at the root folder and can
	// select any folder. Right now the root folder has an empty folder path.
	if len(folderPathString) == 0 && m.IsAny() {
		return true
	}

	return m.Matches(folderPathString)
}

// updatePath records the new path for the folder with the given ID and, on a
// move, rewrites the paths of any folders nested under the old path.
func updatePath(paths map[string]string, id, newPath string) {
	oldPath := paths[id]
	if len(oldPath) == 0 {
		paths[id] = newPath
		return
	}

	if oldPath == newPath {
		return
	}

	// We need to do a prefix search on the rest of the map to update the
	// subtree. We don't need to make collections for all of these, as hierarchy
	// merging in other components should take care of that. We do need to ensure
	// that the resulting map contains all folders though, so we know their paths
	// the next time around.
	for folderID, p := range paths {
		if !strings.HasPrefix(p, oldPath) {
			continue
		}

		paths[folderID] = strings.Replace(p, oldPath, newPath, 1)
	}
}
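
// exampleUpdatePathCascade is an illustrative sketch, not part of the original
// package logic: it shows how updatePath cascades a folder move to nested
// folders via prefix replacement. The folder IDs and path strings below are
// hypothetical. Delta queries don't report unchanged descendants of a moved
// folder, so this cascade is what keeps newPaths consistent.
func exampleUpdatePathCascade() map[string]string {
	paths := map[string]string{
		"folder-a": "root:/a",
		"folder-b": "root:/a/b",
	}

	// Moving folder-a from root:/a to root:/c also rewrites folder-b, even
	// though no delta result was returned for it.
	updatePath(paths, "folder-a", "root:/c")

	// paths is now:
	//   "folder-a": "root:/c"
	//   "folder-b": "root:/c/b"
	return paths
}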