Abhishek Pandey 127570953a
Add prefetch collection for groups (#4906)
<!-- PR description-->

Minor refactor before introducing `lazyFetchCollection` for groups. We'll utilize `lazyFetchCollection` for group mailboxes and will continue to use `prefetchCollection` for channels.
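
At a high level, the difference is that a `prefetchCollection` downloads item content eagerly while the collection is being populated, whereas a `lazyFetchCollection` defers each download until the item is actually read. A minimal sketch of that distinction, using hypothetical types for illustration rather than the actual Corso interfaces:

```go
package sketch

import "context"

// Hypothetical types for illustration only; not the Corso API.

// prefetchedItem carries content that was downloaded eagerly,
// at collection-population time.
type prefetchedItem struct {
	id   string
	body []byte
}

// lazyItem defers the download until the item is first read.
type lazyItem struct {
	id    string
	fetch func(ctx context.Context) ([]byte, error)
}
```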


---

#### Does this PR need a docs update or release note?

- [ ] Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [x] No

#### Type of change

<!--- Please check the type of change your PR introduces: --->
- [ ] 🌻 Feature
- [ ] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [x] 🧹 Tech Debt/Cleanup

#### Issue(s)

<!-- Can reference multiple issues. Use one of the following "magic words" - "closes, fixes" to auto-close the Github issue. -->
* https://github.com/alcionai/corso/issues/4862

#### Test Plan

<!-- How will this be tested prior to merging.-->
- [ ] 💪 Manual
- [x] Unit test
- [ ] 💚 E2E
2023-12-22 23:18:58 +00:00 · 241 lines · 6.2 KiB · Go

package groups

import (
	"bytes"
	"context"
	"io"
	"sync"
	"sync/atomic"
	"time"

	"github.com/alcionai/clues"
	kjson "github.com/microsoft/kiota-serialization-json-go"

	"github.com/alcionai/corso/src/internal/data"
	"github.com/alcionai/corso/src/internal/m365/support"
	"github.com/alcionai/corso/src/internal/observe"
	"github.com/alcionai/corso/src/pkg/backup/details"
	"github.com/alcionai/corso/src/pkg/count"
	"github.com/alcionai/corso/src/pkg/fault"
	"github.com/alcionai/corso/src/pkg/logger"
	"github.com/alcionai/corso/src/pkg/services/m365/api/graph"
)

var _ data.BackupCollection = &prefetchCollection[graph.GetIDer, groupsItemer]{}

const (
	collectionChannelBufferSize = 1000
	numberOfRetries             = 4
)

type prefetchCollection[C graph.GetIDer, I groupsItemer] struct {
	data.BaseCollection

	protectedResource string
	stream            chan data.Item
	contains          container[C]

	// added holds the IDs of existing items that were added to a container.
	added map[string]time.Time
	// removed holds the IDs of items that were deleted from, or moved out of, a container.
	removed map[string]struct{}

	getAndAugment getItemAndAugmentInfoer[C, I]
	statusUpdater support.StatusUpdater
}

// NewCollection creates a prefetchCollection. The collection's state is
// derived from the current and previous paths. If the curr path is nil,
// the state is assumed to be deleted. If the prev path is nil, it is
// assumed newly created. If both are populated, then the state is either
// moved (if they differ) or notMoved (if they match).
func NewCollection[C graph.GetIDer, I groupsItemer](
	baseCol data.BaseCollection,
	getAndAugment getItemAndAugmentInfoer[C, I],
	protectedResource string,
	added map[string]time.Time,
	removed map[string]struct{},
	contains container[C],
	statusUpdater support.StatusUpdater,
) prefetchCollection[C, I] {
	collection := prefetchCollection[C, I]{
		BaseCollection:    baseCol,
		added:             added,
		contains:          contains,
		getAndAugment:     getAndAugment,
		removed:           removed,
		statusUpdater:     statusUpdater,
		stream:            make(chan data.Item, collectionChannelBufferSize),
		protectedResource: protectedResource,
	}

	return collection
}

// Items asynchronously fills the collection's data channel with
// M365 group objects and returns that channel.
func (col *prefetchCollection[C, I]) Items(ctx context.Context, errs *fault.Bus) <-chan data.Item {
	go col.streamItems(ctx, errs)
	return col.stream
}

// ---------------------------------------------------------------------------
// items() production
// ---------------------------------------------------------------------------

func (col *prefetchCollection[C, I]) streamItems(ctx context.Context, errs *fault.Bus) {
	var (
		streamedItems int64
		totalBytes    int64
		wg            sync.WaitGroup
		colProgress   chan<- struct{}

		el = errs.Local()
	)

	ctx = clues.Add(ctx, "category", col.Category().String())

	defer func() {
		logger.Ctx(ctx).Infow(
			"finished stream backup collection items",
			"stats", col.Counter.Values())

		col.finishPopulation(ctx, streamedItems, totalBytes, errs.Failure())
	}()

	if len(col.added)+len(col.removed) > 0 {
		colProgress = observe.CollectionProgress(
			ctx,
			col.Category().HumanString(),
			col.LocationPath().Elements())
		defer close(colProgress)
	}

	// semaphoreCh bounds the number of concurrent item handlers.
	semaphoreCh := make(chan struct{}, col.Opts().Parallelism.ItemFetch)
	defer close(semaphoreCh)

	// stream deletion markers for all removed items
	for id := range col.removed {
		semaphoreCh <- struct{}{}

		wg.Add(1)

		go func(id string) {
			defer wg.Done()
			defer func() { <-semaphoreCh }()

			col.stream <- data.NewDeletedItem(id)

			atomic.AddInt64(&streamedItems, 1)
			col.Counter.Inc(count.StreamItemsRemoved)

			if colProgress != nil {
				colProgress <- struct{}{}
			}
		}(id)
	}

	// fetch, serialize, and stream any new items
	for id := range col.added {
		if el.Failure() != nil {
			break
		}

		wg.Add(1)

		semaphoreCh <- struct{}{}

		go func(id string) {
			defer wg.Done()
			defer func() { <-semaphoreCh }()

			writer := kjson.NewJsonSerializationWriter()
			defer writer.Close()

			item, info, err := col.getAndAugment.getItem(
				ctx,
				col.protectedResource,
				col.FullPath().Folders(),
				id)
			if err != nil {
				err = clues.Wrap(err, "getting channel message data").Label(fault.LabelForceNoBackupCreation)
				el.AddRecoverable(ctx, err)

				return
			}

			col.getAndAugment.augmentItemInfo(info, col.contains.container)

			if err := writer.WriteObjectValue("", item); err != nil {
				err = clues.Wrap(err, "writing channel message to serializer").Label(fault.LabelForceNoBackupCreation)
				el.AddRecoverable(ctx, err)

				return
			}

			itemData, err := writer.GetSerializedContent()
			if err != nil {
				err = clues.Wrap(err, "serializing channel message").Label(fault.LabelForceNoBackupCreation)
				el.AddRecoverable(ctx, err)

				return
			}

			info.ParentPath = col.LocationPath().String()

			storeItem, err := data.NewPrefetchedItemWithInfo(
				io.NopCloser(bytes.NewReader(itemData)),
				id,
				details.ItemInfo{Groups: info})
			if err != nil {
				err := clues.StackWC(ctx, err).Label(fault.LabelForceNoBackupCreation)
				el.AddRecoverable(ctx, err)

				return
			}

			col.stream <- storeItem

			atomic.AddInt64(&streamedItems, 1)
			atomic.AddInt64(&totalBytes, info.Size)

			if col.Counter.Inc(count.StreamItemsAdded)%1000 == 0 {
				logger.Ctx(ctx).Infow("item stream progress", "stats", col.Counter.Values())
			}

			col.Counter.Add(count.StreamBytesAdded, info.Size)

			if colProgress != nil {
				colProgress <- struct{}{}
			}
		}(id)
	}

	wg.Wait()
}

// finishPopulation closes the collection's data channel and sends the
// final status through the statusUpdater.
func (col *prefetchCollection[C, I]) finishPopulation(
	ctx context.Context,
	streamedItems, totalBytes int64,
	err error,
) {
	close(col.stream)

	attempted := len(col.added) + len(col.removed)
	status := support.CreateStatus(
		ctx,
		support.Backup,
		1,
		support.CollectionMetrics{
			Objects:   attempted,
			Successes: int(streamedItems),
			Bytes:     totalBytes,
		},
		col.FullPath().Folder(false))

	logger.Ctx(ctx).Debugw("done streaming items", "status", status.String())

	col.statusUpdater(status)
}