Abhishek Pandey eb0299d316
Push item download requests through drive rate limiter (#4393)
<!-- PR description-->

**Changes:**
1. Count item download requests against the drive rate limit quota. They currently consume from the exchange limiter.
2. Use 1 rate limit token instead of 2 for drive calls by default.
3. Use 2 tokens instead of 1 for the initial delta query (the query made without a delta token). See the sketch below.

Sharing internal docs separately to go along with the review. 
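
A minimal sketch of the token weighting (changes 2 and 3), using `golang.org/x/time/rate` as a stand-in rather than Corso's actual limiter middleware; the names and limits here are illustrative only:

```go
package main

import (
	"context"
	"fmt"

	"golang.org/x/time/rate"
)

// Hypothetical weights mirroring this PR's behavior: ordinary drive
// calls cost 1 token (previously 2), and the initial delta query,
// which enumerates the full drive, costs 2 (previously uncounted).
const (
	defaultDriveTokens = 1
	initialDeltaTokens = 2
)

// waitForDrive blocks until the drive limiter can supply the weighted
// token count for the upcoming request.
func waitForDrive(ctx context.Context, lim *rate.Limiter, initialDelta bool) error {
	tokens := defaultDriveTokens
	if initialDelta {
		tokens = initialDeltaTokens
	}

	return lim.WaitN(ctx, tokens)
}

func main() {
	// Illustrative rate/burst values; real Graph drive quotas differ.
	driveLim := rate.NewLimiter(rate.Limit(100), 100)

	if err := waitForDrive(context.Background(), driveLim, true); err != nil {
		fmt.Println("rate limit wait failed:", err)
	}
}
```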

---

#### Does this PR need a docs update or release note?

- [ ] Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [x] No

#### Type of change

<!--- Please check the type of change your PR introduces: --->
- [ ] 🌻 Feature
- [x] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

<!-- Can reference multiple issues. Use one of the following "magic words" - "closes, fixes" to auto-close the Github issue. -->
* #<issue>

#### Test Plan

<!-- How will this be tested prior to merging.-->
- [x] 💪 Manual
- [ ] Unit test
- [ ] 💚 E2E
2023-10-10 16:45:37 +00:00


// Package drive provides support for retrieving M365 Drive objects
package drive

import (
	"context"
	"io"
	"net/http"
	"sync"
	"sync/atomic"
	"time"

	"github.com/alcionai/clues"
	"github.com/microsoftgraph/msgraph-sdk-go/models"
	"github.com/spatialcurrent/go-lazy/pkg/lazy"

	"github.com/alcionai/corso/src/internal/common/idname"
	"github.com/alcionai/corso/src/internal/common/ptr"
	"github.com/alcionai/corso/src/internal/data"
	"github.com/alcionai/corso/src/internal/m365/collection/drive/metadata"
	"github.com/alcionai/corso/src/internal/m365/graph"
	"github.com/alcionai/corso/src/internal/m365/support"
	"github.com/alcionai/corso/src/internal/observe"
	"github.com/alcionai/corso/src/pkg/backup/details"
	"github.com/alcionai/corso/src/pkg/control"
	"github.com/alcionai/corso/src/pkg/extensions"
	"github.com/alcionai/corso/src/pkg/fault"
	"github.com/alcionai/corso/src/pkg/logger"
	"github.com/alcionai/corso/src/pkg/path"
	"github.com/alcionai/corso/src/pkg/services/m365/api"
)

const (
	// MaxOneNoteFileSize is the size limit (2 GiB) used when checking
	// OneNote files.
	MaxOneNoteFileSize = 2 * 1024 * 1024 * 1024
)

var _ data.BackupCollection = &Collection{}

// Collection represents a set of OneDrive objects retrieved from M365
type Collection struct {
	handler BackupHandler

	// the protected resource represented in this collection.
	protectedResource idname.Provider

	// data is used to share data streams with the collection consumer
	data chan data.Item

	// folderPath indicates what level in the hierarchy this collection
	// represents
	folderPath path.Path

	// M365 IDs of file items within this collection
	driveItems map[string]models.DriveItemable

	// Primary M365 ID of the drive this collection was created from
	driveID   string
	driveName string

	statusUpdater support.StatusUpdater
	ctrl          control.Options

	// prevPath is the previous hierarchical path used by this collection.
	// It may be the same as folderPath, if the folder was not renamed or
	// moved. It will be empty on its first retrieval.
	prevPath path.Path

	// locPath represents the human-readable location of this collection.
	locPath *path.Builder
	// prevLocPath represents the human-readable location of this collection in
	// the previous backup.
	prevLocPath *path.Builder

	// state specifies whether the collection is new, moved/renamed, or
	// deleted.
	state data.CollectionState

	// scope specifies what scope the items in a collection belong
	// to. This is primarily useful when dealing with a "package",
	// like in the case of a OneNote file. A OneNote file is a
	// collection with a package scope and multiple files in it. Most
	// other collections have a scope of folder to indicate that the
	// files within them belong to a folder.
	scope collectionScope

	// doNotMergeItems should only be true if the old delta token expired.
	doNotMergeItems bool

	urlCache getItemPropertyer
}
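
// pathToLocation converts a storage path into a human-readable location
// builder. Illustratively, a drive path whose root element is "root:" and
// whose folders are ["a", "b"] yields the builder for "root:/a/b".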
func pathToLocation(p path.Path) (*path.Builder, error) {
	if p == nil {
		return nil, nil
	}

	dp, err := path.ToDrivePath(p)
	if err != nil {
		return nil, err
	}

	return path.Builder{}.Append(dp.Root).Append(dp.Folders...), nil
}

// NewCollection creates a Collection
func NewCollection(
	handler BackupHandler,
	resource idname.Provider,
	currPath path.Path,
	prevPath path.Path,
	driveID string,
	statusUpdater support.StatusUpdater,
	ctrlOpts control.Options,
	colScope collectionScope,
	doNotMergeItems bool,
	urlCache getItemPropertyer,
) (*Collection, error) {
	// TODO(ashmrtn): If OneDrive switches to using folder IDs then this will need
	// to be changed as we won't be able to extract path information from the
	// storage path. In that case, we'll need to start storing the location paths
	// like we do the previous path.
	locPath, err := pathToLocation(currPath)
	if err != nil {
		return nil, clues.Wrap(err, "getting location").With("curr_path", currPath.String())
	}

	prevLocPath, err := pathToLocation(prevPath)
	if err != nil {
		return nil, clues.Wrap(err, "getting previous location").With("prev_path", prevPath.String())
	}

	c := newColl(
		handler,
		resource,
		currPath,
		prevPath,
		driveID,
		statusUpdater,
		ctrlOpts,
		colScope,
		doNotMergeItems,
		urlCache)

	c.locPath = locPath
	c.prevLocPath = prevLocPath

	return c, nil
}

func newColl(
	handler BackupHandler,
	resource idname.Provider,
	currPath path.Path,
	prevPath path.Path,
	driveID string,
	statusUpdater support.StatusUpdater,
	ctrlOpts control.Options,
	colScope collectionScope,
	doNotMergeItems bool,
	urlCache getItemPropertyer,
) *Collection {
	c := &Collection{
		handler:           handler,
		protectedResource: resource,
		folderPath:        currPath,
		prevPath:          prevPath,
		driveItems:        map[string]models.DriveItemable{},
		driveID:           driveID,
		data:              make(chan data.Item, graph.Parallelism(path.OneDriveMetadataService).CollectionBufferSize()),
		statusUpdater:     statusUpdater,
		ctrl:              ctrlOpts,
		state:             data.StateOf(prevPath, currPath),
		scope:             colScope,
		doNotMergeItems:   doNotMergeItems,
		urlCache:          urlCache,
	}

	return c
}

// Add adds an itemID to the collection, making it eligible to be
// populated. The return value denotes whether the item is a new addition
// (true) or was already present (false).
func (oc *Collection) Add(item models.DriveItemable) bool {
	_, found := oc.driveItems[ptr.Val(item.GetId())]
	oc.driveItems[ptr.Val(item.GetId())] = item

	// if !found, it's a new addition
	return !found
}

// Remove removes an item from the collection
func (oc *Collection) Remove(itemID string) bool {
	_, found := oc.driveItems[itemID]
	if !found {
		return false
	}

	delete(oc.driveItems, itemID)

	return true
}

// IsEmpty reports whether the collection contains no items.
func (oc *Collection) IsEmpty() bool {
	return len(oc.driveItems) == 0
}

// Items returns the channel containing M365 Drive objects.
func (oc *Collection) Items(
	ctx context.Context,
	errs *fault.Bus,
) <-chan data.Item {
	go oc.streamItems(ctx, errs)
	return oc.data
}

func (oc *Collection) FullPath() path.Path {
	return oc.folderPath
}

func (oc Collection) PreviousPath() path.Path {
	return oc.prevPath
}

func (oc *Collection) SetFullPath(curPath path.Path) {
	oc.folderPath = curPath
	oc.state = data.StateOf(oc.prevPath, curPath)
}

func (oc Collection) LocationPath() *path.Builder {
	return oc.locPath
}

func (oc Collection) PreviousLocationPath() details.LocationIDer {
	if oc.prevLocPath == nil {
		return nil
	}

	return oc.handler.NewLocationIDer(oc.driveID, oc.prevLocPath.Elements()...)
}

func (oc Collection) State() data.CollectionState {
	return oc.state
}

func (oc Collection) DoNotMergeItems() bool {
	return oc.doNotMergeItems
}

// getDriveItemContent fetches a drive item's contents, with retries.
func (oc *Collection) getDriveItemContent(
	ctx context.Context,
	driveID string,
	item models.DriveItemable,
	errs *fault.Bus,
) (io.ReadCloser, error) {
	var (
		itemID   = ptr.Val(item.GetId())
		itemName = ptr.Val(item.GetName())
	)

	itemData, err := downloadContent(ctx, oc.handler, oc.urlCache, item, oc.driveID)
	if err != nil {
		if clues.HasLabel(err, graph.LabelsMalware) || (item != nil && item.GetMalware() != nil) {
			logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipMalware).Info("item flagged as malware")
			errs.AddSkip(ctx, fault.FileSkip(fault.SkipMalware, driveID, itemID, itemName, graph.ItemInfo(item)))

			return nil, clues.Wrap(err, "malware item").Label(graph.LabelsSkippable)
		}

		if clues.HasLabel(err, graph.LabelStatus(http.StatusNotFound)) || graph.IsErrDeletedInFlight(err) {
			logger.CtxErr(ctx, err).Info("item not found, probably deleted in flight")
			return nil, clues.Wrap(err, "deleted item").Label(graph.LabelsSkippable)
		}

		// Skip big OneNote files as they can't be downloaded
		if clues.HasLabel(err, graph.LabelStatus(http.StatusServiceUnavailable)) &&
			// TODO: We've removed the file size check because it looks like we've seen persistent
			// 503's with smaller OneNote files also.
			// oc.scope == CollectionScopePackage && *item.GetSize() >= MaxOneNoteFileSize {
			oc.scope == CollectionScopePackage {
			// FIXME: It is possible that in case of a OneNote file we
			// will end up just backing up the `onetoc2` file without
			// the one file which is the important part of the OneNote
			// "item". This will have to be handled during the
			// restore, or we have to handle it separately by somehow
			// deleting the entire collection.
			logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipBigOneNote).Info("max OneNote file size exceeded")
			errs.AddSkip(ctx, fault.FileSkip(fault.SkipBigOneNote, driveID, itemID, itemName, graph.ItemInfo(item)))

			return nil, clues.Wrap(err, "max oneNote item").Label(graph.LabelsSkippable)
		}

		errs.AddRecoverable(
			ctx,
			clues.Wrap(err, "downloading item content").
				WithClues(ctx).
				Label(fault.LabelForceNoBackupCreation))

		// return err, not el.Err(), because the lazy reader needs to communicate to
		// the data consumer that this item is unreadable, regardless of the fault state.
		return nil, clues.Wrap(err, "fetching item content")
	}

	return itemData, nil
}

type itemAndAPIGetter interface {
	GetItemer
	api.Getter
}

// downloadContent attempts to fetch the item content. If the content url
// is expired (ie, returns a 401), it re-fetches the item to get a new download
// url and tries again.
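// The fallback order is: (1) the download URL already present on the item,
// (2) a fresh URL from the url cache, and (3) a full item re-fetch from
// the API.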
func downloadContent(
	ctx context.Context,
	iaag itemAndAPIGetter,
	uc getItemPropertyer,
	item models.DriveItemable,
	driveID string,
) (io.ReadCloser, error) {
	itemID := ptr.Val(item.GetId())
	ctx = clues.Add(ctx, "item_id", itemID)

	content, err := downloadItem(ctx, iaag, item)
	if err == nil {
		return content, nil
	} else if !graph.IsErrUnauthorized(err) {
		return nil, err
	}

	// Assume unauthorized requests are a sign of an expired jwt
	// token, and that we've overrun the available window to
	// download the file. Get a fresh url from the cache and attempt to
	// download again.
	content, err = readItemContents(ctx, iaag, uc, itemID)
	if err == nil {
		logger.Ctx(ctx).Debug("found item in url cache")
		return content, nil
	}

	// Consider cache errors (including deleted items) as cache misses. This is
	// to preserve existing behavior. Fall back to refetching the item using the
	// API.
	logger.CtxErr(ctx, err).Info("url cache miss: refetching from API")

	di, err := iaag.GetItem(ctx, driveID, ptr.Val(item.GetId()))
	if err != nil {
		return nil, clues.Wrap(err, "retrieving expired item")
	}

	content, err = downloadItem(ctx, iaag, di)
	if err != nil {
		return nil, clues.Wrap(err, "content download retry")
	}

	return content, nil
}

// readItemContents fetches the latest download URL from the cache and
// attempts to download the file using the new URL.
func readItemContents(
	ctx context.Context,
	iaag itemAndAPIGetter,
	uc getItemPropertyer,
	itemID string,
) (io.ReadCloser, error) {
	if uc == nil {
		return nil, clues.New("nil url cache")
	}

	props, err := uc.getItemProperties(ctx, itemID)
	if err != nil {
		return nil, err
	}

	// Handle newly deleted items
	if props.isDeleted {
		logger.Ctx(ctx).Info("item deleted in cache")
		return nil, graph.ErrDeletedInFlight
	}

	rc, err := downloadFile(ctx, iaag, props.downloadURL)
	if graph.IsErrUnauthorized(err) {
		logger.CtxErr(ctx, err).Info("stale item in cache")
	}

	if err != nil {
		return nil, err
	}

	return rc, nil
}

type driveStats struct {
	dirsRead   int64
	dirsFound  int64
	byteCount  int64
	itemsRead  int64
	itemsFound int64
}

// streamItems iterates through items added to the collection
// and uses the collection `itemReader` to read each item.
func (oc *Collection) streamItems(ctx context.Context, errs *fault.Bus) {
	var (
		stats driveStats
		wg    sync.WaitGroup
	)

	// Retrieve the OneDrive folder path to set later in
	// `details.OneDriveInfo`
	parentPath, err := path.GetDriveFolderPath(oc.folderPath)
	if err != nil {
		oc.reportAsCompleted(ctx, 0, 0, 0)
		return
	}

	queuedPath := oc.handler.FormatDisplayPath(oc.driveName, parentPath)

	folderProgress := observe.ProgressWithCount(
		ctx,
		observe.ItemQueueMsg,
		path.NewElements(queuedPath),
		int64(len(oc.driveItems)))
	defer close(folderProgress)
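
	// semaphoreCh caps the number of concurrent item downloads: each
	// goroutine below acquires a slot before starting and releases it
	// when done.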
	semaphoreCh := make(chan struct{}, graph.Parallelism(path.OneDriveService).Item())
	defer close(semaphoreCh)

	for _, item := range oc.driveItems {
		if errs.Failure() != nil {
			break
		}

		semaphoreCh <- struct{}{}

		wg.Add(1)

		go func(item models.DriveItemable) {
			defer wg.Done()
			defer func() { <-semaphoreCh }()

			// Read the item
			oc.streamDriveItem(
				ctx,
				parentPath,
				item,
				&stats,
				oc.ctrl.ItemExtensionFactory,
				errs)

			folderProgress <- struct{}{}
		}(item)
	}

	wg.Wait()

	oc.reportAsCompleted(ctx, int(stats.itemsFound), int(stats.itemsRead), stats.byteCount)
}

type lazyItemGetter struct {
	info                 *details.ItemInfo
	item                 models.DriveItemable
	driveID              string
	suffix               string
	itemExtensionFactory []extensions.CreateItemExtensioner
	contentGetter        func(
		ctx context.Context,
		driveID string,
		item models.DriveItemable,
		errs *fault.Bus) (io.ReadCloser, error)
}

func (lig *lazyItemGetter) GetData(
	ctx context.Context,
	errs *fault.Bus,
) (io.ReadCloser, *details.ItemInfo, bool, error) {
	rc, err := lig.contentGetter(ctx, lig.driveID, lig.item, errs)
	if err != nil {
		return nil, nil, false, clues.Stack(err)
	}

	extRc, extData, err := extensions.AddItemExtensions(
		ctx,
		rc,
		*lig.info,
		lig.itemExtensionFactory)
	if err != nil {
		err := clues.Wrap(err, "adding extensions").
			WithClues(ctx).
			Label(fault.LabelForceNoBackupCreation)

		return nil, nil, false, err
	}

	lig.info.Extension.Data = extData.Data

	// display/log the item download
	progReader, _ := observe.ItemProgress(
		ctx,
		extRc,
		observe.ItemBackupMsg,
		clues.Hide(ptr.Val(lig.item.GetName())+lig.suffix),
		ptr.Val(lig.item.GetSize()))

	return progReader, lig.info, false, nil
}

func (oc *Collection) streamDriveItem(
	ctx context.Context,
	parentPath *path.Builder,
	item models.DriveItemable,
	stats *driveStats,
	itemExtensionFactory []extensions.CreateItemExtensioner,
	errs *fault.Bus,
) {
	var (
		itemID       = ptr.Val(item.GetId())
		itemName     = ptr.Val(item.GetName())
		itemSize     = ptr.Val(item.GetSize())
		itemInfo     details.ItemInfo
		itemMeta     io.ReadCloser
		itemMetaSize int
		metaFileName string
		metaSuffix   string
		err          error
	)

	ctx = clues.Add(
		ctx,
		"item_id", itemID,
		"item_name", clues.Hide(itemName),
		"item_size", itemSize)

	item.SetParentReference(setName(item.GetParentReference(), oc.driveName))

	isFile := item.GetFile() != nil

	if isFile {
		atomic.AddInt64(&stats.itemsFound, 1)

		metaFileName = itemID
		metaSuffix = metadata.MetaFileSuffix
	} else {
		atomic.AddInt64(&stats.dirsFound, 1)

		// metaFileName not set for directories so we get just ".dirmeta"
		metaSuffix = metadata.DirMetaFileSuffix
	}

	// Fetch metadata for the item
	itemMeta, itemMetaSize, err = downloadItemMeta(ctx, oc.handler, oc.driveID, item)
	if err != nil {
		// Skip deleted items
		if !clues.HasLabel(err, graph.LabelStatus(http.StatusNotFound)) && !graph.IsErrDeletedInFlight(err) {
			errs.AddRecoverable(ctx, clues.Wrap(err, "getting item metadata").Label(fault.LabelForceNoBackupCreation))
		}

		return
	}

	itemInfo = oc.handler.AugmentItemInfo(
		itemInfo,
		oc.protectedResource,
		item,
		itemSize,
		parentPath)

	ctx = clues.Add(ctx, "item_info", itemInfo)

	// Drive content download requests are also rate limited by the Graph API.
	// Ensure that this request goes through the drive limiter, not the default
	// limiter.
	ctx = graph.BindRateLimiterConfig(
		ctx,
		graph.LimiterCfg{
			Service: path.OneDriveService,
		})
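
	// The lazy item below captures this ctx, so the deferred content
	// download in getDriveItemContent is also charged against the drive
	// limiter rather than the default (exchange) one.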
	if isFile {
		dataSuffix := metadata.DataFileSuffix

		// Use a LazyItem to feed to the collection consumer.
		// This ensures that downloads won't be attempted unless that consumer
		// attempts to read bytes. Assumption is that kopia will check things
		// like file modtimes before attempting to read.
		oc.data <- data.NewLazyItemWithInfo(
			ctx,
			&lazyItemGetter{
				info:                 &itemInfo,
				item:                 item,
				driveID:              oc.driveID,
				itemExtensionFactory: itemExtensionFactory,
				contentGetter:        oc.getDriveItemContent,
				suffix:               dataSuffix,
			},
			itemID+dataSuffix,
			itemInfo.Modified(),
			errs)
	}

	metaReader := lazy.NewLazyReadCloser(func() (io.ReadCloser, error) {
		progReader, _ := observe.ItemProgress(
			ctx,
			itemMeta,
			observe.ItemBackupMsg,
			clues.Hide(itemName+metaSuffix),
			int64(itemMetaSize))

		return progReader, nil
	})

	storeItem, err := data.NewPrefetchedItem(
		metaReader,
		metaFileName+metaSuffix,
		// Metadata file should always use the latest time, as a
		// permissions change does not update the mod time.
		time.Now())
	if err != nil {
		errs.AddRecoverable(ctx, clues.Stack(err).
			WithClues(ctx).
			Label(fault.LabelForceNoBackupCreation))

		return
	}

	// We wrap the reader with a lazy reader so that the progress bar is only
	// initialized if the file is read. Since we're not actually lazily reading
	// data, we just use the eager item implementation.
	oc.data <- storeItem

	// Item read successfully; update stats.
	if isFile {
		atomic.AddInt64(&stats.itemsRead, 1)
	} else {
		atomic.AddInt64(&stats.dirsRead, 1)
	}

	atomic.AddInt64(&stats.byteCount, itemSize)
}

func (oc *Collection) reportAsCompleted(ctx context.Context, itemsFound, itemsRead int, byteCount int64) {
	close(oc.data)

	status := support.CreateStatus(ctx, support.Backup,
		1, // num folders (always 1)
		support.CollectionMetrics{
			Objects:   itemsFound,
			Successes: itemsRead,
			Bytes:     byteCount,
		},
		oc.folderPath.Folder(false))

	logger.Ctx(ctx).Debugw("done streaming items", "status", status.String())

	oc.statusUpdater(status)
}