# add clues/fault to sharepoint api (#2507)
## Does this PR need a docs update or release note?

- [x] No

## Type of change

- [x] 🧹 Tech Debt/Cleanup

## Issue(s)

* #1970

## Test Plan

- [x] Unit test
- [x] 💚 E2E

package sharepoint

import (
	"bytes"
	"context"
	"io"
	"time"

	"github.com/alcionai/clues"
	absser "github.com/microsoft/kiota-abstractions-go/serialization"
	kw "github.com/microsoft/kiota-serialization-json-go"

	"github.com/alcionai/corso/src/internal/common/ptr"
	"github.com/alcionai/corso/src/internal/connector/discovery/api"
	"github.com/alcionai/corso/src/internal/connector/graph"
	sapi "github.com/alcionai/corso/src/internal/connector/sharepoint/api"
	"github.com/alcionai/corso/src/internal/connector/support"
	"github.com/alcionai/corso/src/internal/data"
	"github.com/alcionai/corso/src/internal/observe"
	"github.com/alcionai/corso/src/pkg/backup/details"
	"github.com/alcionai/corso/src/pkg/control"
	"github.com/alcionai/corso/src/pkg/fault"
	"github.com/alcionai/corso/src/pkg/logger"
	"github.com/alcionai/corso/src/pkg/path"
)
type DataCategory int

//go:generate stringer -type=DataCategory
const (
	// Unknown is the zero-value category.
	Unknown DataCategory = iota
	List
	Drive
	Pages
)

const (
	collectionChannelBufferSize = 50
	fetchChannelSize            = 5
)
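// Compile-time checks that Collection and Item satisfy the interfaces
// in the data package.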
var (
	_ data.BackupCollection = &Collection{}
	_ data.Stream           = &Item{}
	_ data.StreamInfo       = &Item{}
	_ data.StreamModTime    = &Item{}
)
type numMetrics struct {
	attempts   int
	success    int
	totalBytes int64
}
// Collection is the SharePoint.List implementation of data.Collection.
// SharePoint.Libraries collections are supported by oneDrive.Collection,
// as the calls for populating the Collection are identical.
type Collection struct {
	// data is the container for each individual SharePoint.List
	data chan data.Stream
	// fullPath indicates the hierarchy within the collection
	fullPath path.Path
	// jobs contains the SharePoint.Site.ListIDs (M365 IDs) of the items
	// belonging to this collection
	jobs []string
	// category identifies the type of data held by the collection
	category      DataCategory
	service       graph.Servicer
	ctrl          control.Options
	betaService   *api.BetaService
	statusUpdater support.StatusUpdater
}
// NewCollection is a helper function for creating a Collection.
func NewCollection(
	folderPath path.Path,
	service graph.Servicer,
	category DataCategory,
	statusUpdater support.StatusUpdater,
	ctrlOpts control.Options,
) *Collection {
	c := &Collection{
		fullPath:      folderPath,
		jobs:          make([]string, 0),
		data:          make(chan data.Stream, collectionChannelBufferSize),
		service:       service,
		statusUpdater: statusUpdater,
		category:      category,
		ctrl:          ctrlOpts,
	}

	return c
}
// AddJob appends an additional objectID to the jobs field.
func (sc *Collection) AddJob(objID string) {
	sc.jobs = append(sc.jobs, objID)
}
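// FullPath returns the path of this collection within the backup hierarchy.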
func (sc *Collection) FullPath() path.Path {
	return sc.fullPath
}
// TODO(ashmrtn): Fill in with previous path once GraphConnector compares old
// and new folder hierarchies.
func (sc Collection) PreviousPath() path.Path {
	return nil
}

func (sc Collection) State() data.CollectionState {
	return data.NewState
}

func (sc Collection) DoNotMergeItems() bool {
	return false
}
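// Items begins populating the collection in a background goroutine and
// returns the channel over which the resulting data.Streams are delivered.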
func (sc *Collection) Items(
	ctx context.Context,
	errs *fault.Errors,
) <-chan data.Stream {
	go sc.populate(ctx, errs)
	return sc.data
}
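// Item represents a single SharePoint object retrieved during a backup.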
type Item struct {
	id      string
	data    io.ReadCloser
	info    *details.SharePointInfo
	modTime time.Time

	// true if the item was marked by graph as deleted.
	deleted bool
}
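// NewItem constructs an Item from a name and a reader over its content.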
func NewItem(name string, d io.ReadCloser) *Item {
	return &Item{
		id:   name,
		data: d,
	}
}
func (sd *Item) UUID() string {
	return sd.id
}

func (sd *Item) ToReader() io.ReadCloser {
	return sd.data
}

func (sd Item) Deleted() bool {
	return sd.deleted
}

func (sd *Item) Info() details.ItemInfo {
	return details.ItemInfo{SharePoint: sd.info}
}

func (sd *Item) ModTime() time.Time {
	return sd.modTime
}
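// finishPopulation closes the data channel and reports the collection's
// backup metrics through the statusUpdater, if one is set.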
func (sc *Collection) finishPopulation(
	ctx context.Context,
	attempts, success int,
	totalBytes int64,
	err error,
) {
	close(sc.data)

	status := support.CreateStatus(
		ctx,
		support.Backup,
		1, // 1 folder
		support.CollectionMetrics{
			Objects:    attempts,
			Successes:  success,
			TotalBytes: totalBytes,
		},
		err,
		sc.fullPath.Folder(false))

	logger.Ctx(ctx).Debug(status.String())

	if sc.statusUpdater != nil {
		sc.statusUpdater(status)
	}
}
// populate is a utility function that retrieves data from the backing store
// for a given collection.
func (sc *Collection) populate(ctx context.Context, errs *fault.Errors) {
	var (
		metrics numMetrics
		writer  = kw.NewJsonSerializationWriter()
		err     error
	)

	defer func() {
		sc.finishPopulation(ctx, metrics.attempts, metrics.success, metrics.totalBytes, err)
	}()

	// TODO: Insert correct ID for CollectionProgress
	colProgress, closer := observe.CollectionProgress(
		ctx,
		sc.fullPath.Category().String(),
		observe.Safe("name"),
		observe.PII(sc.fullPath.Folder(false)))
	go closer()

	defer func() {
		close(colProgress)
	}()

	// Switch retrieval function based on category.
	switch sc.category {
	case List:
		metrics, err = sc.retrieveLists(ctx, writer, colProgress, errs)
	case Pages:
		metrics, err = sc.retrievePages(ctx, writer, colProgress, errs)
	}
}
// retrieveLists is a utility function for the collection that downloads and
// serializes models.Listable objects based on M365 IDs from the jobs field.
func (sc *Collection) retrieveLists(
	ctx context.Context,
	wtr *kw.JsonSerializationWriter,
	progress chan<- struct{},
	errs *fault.Errors,
) (numMetrics, error) {
	var metrics numMetrics

	lists, err := loadSiteLists(ctx, sc.service, sc.fullPath.ResourceOwner(), sc.jobs, errs)
	if err != nil {
		return metrics, err
	}

	metrics.attempts += len(lists)

	// Each models.Listable is serialized and the metrics are collected.
	// Progress is reported via the passed-in channel.
	for _, lst := range lists {
		if errs.Err() != nil {
			break
		}

		byteArray, err := serializeContent(wtr, lst)
		if err != nil {
			errs.Add(clues.Wrap(err, "serializing list").WithClues(ctx))
			continue
		}

		size := int64(len(byteArray))

		if size > 0 {
			t := time.Now()
			if t1 := lst.GetLastModifiedDateTime(); t1 != nil {
				t = *t1
			}

			metrics.totalBytes += size
			metrics.success++
			sc.data <- &Item{
				id:      *lst.GetId(),
				data:    io.NopCloser(bytes.NewReader(byteArray)),
				info:    sharePointListInfo(lst, size),
				modTime: t,
			}

			progress <- struct{}{}
		}
	}

	return metrics, errs.Err()
}
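// retrievePages downloads and serializes site pages from the jobs field.
// Pages are only available through the beta Graph API, so a configured
// betaService is required.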
func (sc *Collection) retrievePages(
	ctx context.Context,
	wtr *kw.JsonSerializationWriter,
	progress chan<- struct{},
	errs *fault.Errors,
) (numMetrics, error) {
	var metrics numMetrics

	betaService := sc.betaService
	if betaService == nil {
		return metrics, clues.New("beta service required").WithClues(ctx)
	}

	pages, err := sapi.GetSitePages(ctx, betaService, sc.fullPath.ResourceOwner(), sc.jobs, errs)
	if err != nil {
		return metrics, err
	}

	metrics.attempts = len(pages)

	// Each models.Pageable is serialized and the metrics are collected and
	// returned. Pageable objects are not supported in v1.0 of msgraph at
	// this time.
	// TODO: Verify Parsable interface supported with modified-Pageable
	for _, pg := range pages {
		if errs.Err() != nil {
			break
		}

		byteArray, err := serializeContent(wtr, pg)
		if err != nil {
			errs.Add(clues.Wrap(err, "serializing page").WithClues(ctx))
			continue
		}

		size := int64(len(byteArray))

		if size > 0 {
			metrics.totalBytes += size
			metrics.success++
			sc.data <- &Item{
				id:      *pg.GetId(),
				data:    io.NopCloser(bytes.NewReader(byteArray)),
				info:    sapi.PageInfo(pg, size),
				modTime: ptr.OrNow(pg.GetLastModifiedDateTime()),
			}

			progress <- struct{}{}
		}
	}

	return metrics, errs.Err()
}
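// serializeContent writes obj into the JSON writer and returns the
// serialized bytes; the writer is closed before returning so it can be
// reused for subsequent objects.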
func serializeContent(writer *kw.JsonSerializationWriter, obj absser.Parsable) ([]byte, error) {
	defer writer.Close()

	err := writer.WriteObjectValue("", obj)
	if err != nil {
		return nil, clues.Wrap(err, "writing object").With(graph.ErrData(err)...)
	}

	byteArray, err := writer.GetSerializedContent()
	if err != nil {
		return nil, clues.Wrap(err, "getting content from writer").With(graph.ErrData(err)...)
	}

	return byteArray, nil
}