clean up operation run-do funcs (#2391)

## Description

The operation run-do pattern is currently spaghetti,
The do() call includes a deferred persistence that
occurs before the Run call concludes, causing
us to need panic handlers in multiple levels.

This change normalizes the interacton: do() now
only contains the behavior necessary to process
the backup or restore.  Run() contains all setup
and teardown processes surrounding that.

General pattern looks like this:

Run()
    0. defer panic recovery
    1. create state builders/recorder vars/clients
    2. call do()
    3. persist results of do(), even in case of error.
    
do()
    process step-by-step backup or restore operation
    update builders/recorders along the way
    exit immediately on any error

## Does this PR need a docs update or release note?

- [x]  No 

## Type of change

- [x] 🧹 Tech Debt/Cleanup

## Test Plan

- [x]  Unit test
- [x] 💚 E2E
This commit is contained in:
Keepers 2023-02-03 17:11:59 -07:00 committed by GitHub
parent 38f56cccba
commit 8d04957e5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 192 additions and 157 deletions

View File

@ -2,6 +2,7 @@ package operations
import (
"context"
"fmt"
"runtime/debug"
"time"
@ -110,7 +111,21 @@ type detailsWriter interface {
func (op *BackupOperation) Run(ctx context.Context) (err error) {
defer func() {
if r := recover(); r != nil {
err = clues.Wrap(r.(error), "panic recovery").WithClues(ctx).With("stacktrace", debug.Stack())
var rerr error
if re, ok := r.(error); ok {
rerr = re
} else if re, ok := r.(string); ok {
rerr = clues.New(re)
} else {
rerr = clues.New(fmt.Sprintf("%v", r))
}
err = clues.Wrap(rerr, "panic recovery").
WithClues(ctx).
With("stacktrace", string(debug.Stack()))
logger.Ctx(ctx).
With("err", err).
Errorw("backup panic", clues.InErr(err).Slice()...)
}
}()
@ -121,6 +136,18 @@ func (op *BackupOperation) Run(ctx context.Context) (err error) {
observe.Complete()
}()
// -----
// Setup
// -----
var (
opStats backupStats
startTime = time.Now()
detailsStore = streamstore.New(op.kopia, op.account.ID(), op.Selectors.PathService())
)
op.Results.BackupID = model.StableID(uuid.NewString())
ctx = clues.AddAll(
ctx,
"tenant_id", op.account.ID(), // TODO: pii
@ -129,32 +156,6 @@ func (op *BackupOperation) Run(ctx context.Context) (err error) {
"service", op.Selectors.Service,
"incremental", op.incremental)
if err := op.do(ctx); err != nil {
logger.Ctx(ctx).
With("err", err).
Errorw("backup operation", clues.InErr(err).Slice()...)
return err
}
logger.Ctx(ctx).Infow("completed backup", "results", op.Results)
return nil
}
func (op *BackupOperation) do(ctx context.Context) (err error) {
var (
opStats backupStats
backupDetails *details.Builder
toMerge map[string]path.Path
tenantID = op.account.ID()
startTime = time.Now()
detailsStore = streamstore.New(op.kopia, tenantID, op.Selectors.PathService())
reasons = selectorToReasons(op.Selectors)
)
op.Results.BackupID = model.StableID(uuid.NewString())
op.bus.Event(
ctx,
events.BackupStart,
@ -162,122 +163,128 @@ func (op *BackupOperation) do(ctx context.Context) (err error) {
events.StartTime: startTime,
events.Service: op.Selectors.Service.String(),
events.BackupID: op.Results.BackupID,
},
)
})
// persist operation results to the model store on exit
defer func() {
// panic recovery here prevents additional errors in op.persistResults()
if r := recover(); r != nil {
err = clues.Wrap(r.(error), "panic recovery").WithClues(ctx).With("stacktrace", debug.Stack())
return
}
// -----
// Execution
// -----
err = op.persistResults(startTime, &opStats)
if err != nil {
op.Errors.Fail(errors.Wrap(err, "persisting backup results"))
return
}
deets, err := op.do(
ctx,
&opStats,
detailsStore,
op.Results.BackupID)
if err != nil {
// No return here! We continue down to persistResults, even in case of failure.
logger.Ctx(ctx).
With("err", err).
Errorw("doing backup", clues.InErr(err).Slice()...)
op.Errors.Fail(errors.Wrap(err, "doing backup"))
opStats.readErr = op.Errors.Err()
}
err = op.createBackupModels(
ctx,
detailsStore,
opStats.k.SnapshotID,
backupDetails.Details())
if err != nil {
op.Errors.Fail(errors.Wrap(err, "persisting backup"))
opStats.writeErr = op.Errors.Err()
}
}()
// -----
// Persistence
// -----
err = op.persistResults(startTime, &opStats)
if err != nil {
op.Errors.Fail(errors.Wrap(err, "persisting backup results"))
opStats.writeErr = op.Errors.Err()
return op.Errors.Err()
}
err = op.createBackupModels(
ctx,
detailsStore,
opStats.k.SnapshotID,
op.Results.BackupID,
deets.Details())
if err != nil {
op.Errors.Fail(errors.Wrap(err, "persisting backup"))
opStats.writeErr = op.Errors.Err()
return op.Errors.Err()
}
logger.Ctx(ctx).Infow("completed backup", "results", op.Results)
return nil
}
// do is purely the action of running a backup. All pre/post behavior
// is found in Run().
func (op *BackupOperation) do(
ctx context.Context,
opStats *backupStats,
detailsStore detailsReader,
backupID model.StableID,
) (*details.Builder, error) {
reasons := selectorToReasons(op.Selectors)
// should always be 1, since backups are 1:1 with resourceOwners.
opStats.resourceCount = 1
mans, mdColls, canUseMetaData, err := produceManifestsAndMetadata(
ctx,
op.kopia,
op.store,
reasons,
tenantID,
op.account.ID(),
op.incremental,
op.Errors)
if err != nil {
op.Errors.Fail(errors.Wrap(err, "collecting manifest heuristics"))
opStats.readErr = op.Errors.Err()
logger.Ctx(ctx).With("err", err).Errorw("producing manifests and metadata", clues.InErr(err).Slice()...)
return opStats.readErr
return nil, errors.Wrap(err, "producing manifests and metadata")
}
gc, err := connectToM365(ctx, op.Selectors, op.account)
if err != nil {
op.Errors.Fail(errors.Wrap(err, "connecting to m365"))
opStats.readErr = op.Errors.Err()
logger.Ctx(ctx).With("err", err).Errorw("connectng to m365", clues.InErr(err).Slice()...)
return opStats.readErr
return nil, errors.Wrap(err, "connectng to m365")
}
cs, err := produceBackupDataCollections(ctx, gc, op.Selectors, mdColls, op.Options)
if err != nil {
op.Errors.Fail(errors.Wrap(err, "retrieving data to backup"))
opStats.readErr = op.Errors.Err()
logger.Ctx(ctx).With("err", err).Errorw("producing backup data collections", clues.InErr(err).Slice()...)
return opStats.readErr
return nil, errors.Wrap(err, "producing backup data collections")
}
ctx = clues.Add(ctx, "coll_count", len(cs))
opStats.k, backupDetails, toMerge, err = consumeBackupDataCollections(
writeStats, deets, toMerge, err := consumeBackupDataCollections(
ctx,
op.kopia,
tenantID,
op.account.ID(),
reasons,
mans,
cs,
op.Results.BackupID,
backupID,
op.incremental && canUseMetaData)
if err != nil {
op.Errors.Fail(errors.Wrap(err, "backing up service data"))
opStats.writeErr = op.Errors.Err()
logger.Ctx(ctx).With("err", err).Errorw("persisting collection backups", clues.InErr(err).Slice()...)
return opStats.writeErr
return nil, errors.Wrap(err, "persisting collection backups")
}
if err = mergeDetails(
opStats.k = writeStats
err = mergeDetails(
ctx,
op.store,
detailsStore,
mans,
toMerge,
backupDetails,
); err != nil {
op.Errors.Fail(errors.Wrap(err, "merging backup details"))
opStats.writeErr = op.Errors.Err()
logger.Ctx(ctx).With("err", err).Errorw("merging details", clues.InErr(err).Slice()...)
return opStats.writeErr
deets)
if err != nil {
return nil, errors.Wrap(err, "merging details")
}
opStats.gc = gc.AwaitStatus()
// TODO(keepers): remove when fault.Errors handles all iterable error aggregation.
if opStats.gc.ErrorCount > 0 {
merr := multierror.Append(opStats.readErr, errors.Wrap(opStats.gc.Err, "retrieving data"))
opStats.readErr = merr.ErrorOrNil()
// Need to exit before we set started to true else we'll report no errors.
return opStats.readErr
return nil, opStats.gc.Err
}
// should always be 1, since backups are 1:1 with resourceOwners.
opStats.resourceCount = 1
logger.Ctx(ctx).Debug(gc.PrintableStatus())
return err
return deets, nil
}
// checker to see if conditions are correct for incremental backup behavior such as
@ -520,8 +527,7 @@ func mergeDetails(
ctx,
model.StableID(bID),
ms,
detailsStore,
)
detailsStore)
if err != nil {
return clues.New("fetching base details for backup").WithClues(mctx)
}
@ -566,8 +572,7 @@ func mergeDetails(
newPath.ShortRef(),
newPath.ToBuilder().Dir().ShortRef(),
itemUpdated,
item,
)
item)
folders := details.FolderEntriesForPath(newPath.ToBuilder().Dir())
deets.AddFoldersForItem(folders, item, itemUpdated)
@ -617,10 +622,10 @@ func (op *BackupOperation) persistResults(
if opStats.gc == nil {
op.Status = Failed
return errors.New("data population never completed")
return errors.New("backup population never completed")
}
if opStats.readErr == nil && opStats.writeErr == nil && opStats.gc.Successful == 0 {
if opStats.gc.Successful == 0 {
op.Status = NoData
}
@ -634,6 +639,7 @@ func (op *BackupOperation) createBackupModels(
ctx context.Context,
detailsStore detailsWriter,
snapID string,
backupID model.StableID,
backupDetails *details.Details,
) error {
ctx = clues.Add(ctx, "snapshot_id", snapID)
@ -650,7 +656,7 @@ func (op *BackupOperation) createBackupModels(
ctx = clues.Add(ctx, "details_id", detailsID)
b := backup.New(
snapID, detailsID, op.Status.String(),
op.Results.BackupID,
backupID,
op.Selectors,
op.Results.ReadWrites,
op.Results.StartAndEndTime,

View File

@ -110,10 +110,37 @@ type restorer interface {
func (op *RestoreOperation) Run(ctx context.Context) (restoreDetails *details.Details, err error) {
defer func() {
if r := recover(); r != nil {
err = clues.Wrap(r.(error), "panic recovery").WithClues(ctx).With("stacktrace", debug.Stack())
var rerr error
if re, ok := r.(error); ok {
rerr = re
} else if re, ok := r.(string); ok {
rerr = clues.New(re)
} else {
rerr = clues.New(fmt.Sprintf("%v", r))
}
err = clues.Wrap(rerr, "panic recovery").
WithClues(ctx).
With("stacktrace", string(debug.Stack()))
logger.Ctx(ctx).
With("err", err).
Errorw("backup panic", clues.InErr(err).Slice()...)
}
}()
var (
opStats = restoreStats{
bytesRead: &stats.ByteCounter{},
restoreID: uuid.NewString(),
}
start = time.Now()
detailsStore = streamstore.New(op.kopia, op.account.ID(), op.Selectors.PathService())
)
// -----
// Setup
// -----
ctx, end := D.Span(ctx, "operations:restore:run")
defer func() {
end()
@ -127,13 +154,30 @@ func (op *RestoreOperation) Run(ctx context.Context) (restoreDetails *details.De
"backup_id", op.BackupID,
"service", op.Selectors.Service)
deets, err := op.do(ctx)
// -----
// Execution
// -----
deets, err := op.do(ctx, &opStats, detailsStore, start)
if err != nil {
// No return here! We continue down to persistResults, even in case of failure.
logger.Ctx(ctx).
With("err", err).
Errorw("restore operation", clues.InErr(err).Slice()...)
Errorw("doing restore", clues.InErr(err).Slice()...)
op.Errors.Fail(errors.Wrap(err, "doing restore"))
opStats.readErr = op.Errors.Err()
}
return nil, err
// -----
// Persistence
// -----
err = op.persistResults(ctx, start, &opStats)
if err != nil {
op.Errors.Fail(errors.Wrap(err, "persisting restore results"))
opStats.writeErr = op.Errors.Err()
return nil, op.Errors.Err()
}
logger.Ctx(ctx).Infow("completed restore", "results", op.Results)
@ -141,30 +185,12 @@ func (op *RestoreOperation) Run(ctx context.Context) (restoreDetails *details.De
return deets, nil
}
func (op *RestoreOperation) do(ctx context.Context) (restoreDetails *details.Details, err error) {
var (
opStats = restoreStats{
bytesRead: &stats.ByteCounter{},
restoreID: uuid.NewString(),
}
startTime = time.Now()
)
defer func() {
// panic recovery here prevents additional errors in op.persistResults()
if r := recover(); r != nil {
err = clues.Wrap(r.(error), "panic recovery").WithClues(ctx).With("stacktrace", debug.Stack())
return
}
err = op.persistResults(ctx, startTime, &opStats)
if err != nil {
return
}
}()
detailsStore := streamstore.New(op.kopia, op.account.ID(), op.Selectors.PathService())
func (op *RestoreOperation) do(
ctx context.Context,
opStats *restoreStats,
detailsStore detailsReader,
start time.Time,
) (*details.Details, error) {
bup, deets, err := getBackupAndDetailsFromID(
ctx,
op.BackupID,
@ -172,30 +198,29 @@ func (op *RestoreOperation) do(ctx context.Context) (restoreDetails *details.Det
detailsStore,
)
if err != nil {
opStats.readErr = errors.Wrap(err, "restore")
return nil, opStats.readErr
return nil, errors.Wrap(err, "getting backup and details")
}
ctx = clues.Add(ctx, "resource_owner", bup.Selector.DiscreteOwner)
paths, err := formatDetailsForRestoration(ctx, op.Selectors, deets)
if err != nil {
return nil, errors.Wrap(err, "formatting paths from details")
}
ctx = clues.AddAll(
ctx,
"resource_owner", bup.Selector.DiscreteOwner,
"details_paths", len(paths))
op.bus.Event(
ctx,
events.RestoreStart,
map[string]any{
events.StartTime: startTime,
events.StartTime: start,
events.BackupID: op.BackupID,
events.BackupCreateTime: bup.CreationTime,
events.RestoreID: opStats.restoreID,
},
)
})
paths, err := formatDetailsForRestoration(ctx, op.Selectors, deets)
if err != nil {
opStats.readErr = err
return nil, err
}
ctx = clues.Add(ctx, "details_paths", len(paths))
observe.Message(ctx, observe.Safe(fmt.Sprintf("Discovered %d items in backup %s to restore", len(paths), op.BackupID)))
kopiaComplete, closer := observe.MessageWithCompletion(ctx, observe.Safe("Enumerating items in repository"))
@ -204,41 +229,45 @@ func (op *RestoreOperation) do(ctx context.Context) (restoreDetails *details.Det
dcs, err := op.kopia.RestoreMultipleItems(ctx, bup.SnapshotID, paths, opStats.bytesRead)
if err != nil {
opStats.readErr = errors.Wrap(err, "retrieving service data")
return nil, opStats.readErr
return nil, errors.Wrap(err, "retrieving collections from repository")
}
kopiaComplete <- struct{}{}
ctx = clues.Add(ctx, "coll_count", len(dcs))
// should always be 1, since backups are 1:1 with resourceOwners.
opStats.resourceCount = 1
opStats.cs = dcs
opStats.resourceCount = len(data.ResourceOwnerSet(dcs))
gc, err := connectToM365(ctx, op.Selectors, op.account)
if err != nil {
opStats.readErr = errors.Wrap(err, "connecting to M365")
return nil, opStats.readErr
return nil, errors.Wrap(err, "connecting to M365")
}
restoreComplete, closer := observe.MessageWithCompletion(ctx, observe.Safe("Restoring data"))
defer closer()
defer close(restoreComplete)
restoreDetails, err = gc.RestoreDataCollections(
restoreDetails, err := gc.RestoreDataCollections(
ctx,
bup.Version,
op.account,
op.Selectors,
op.Destination,
op.Options,
dcs,
)
dcs)
if err != nil {
opStats.writeErr = errors.Wrap(err, "restoring service data")
return nil, opStats.writeErr
return nil, errors.Wrap(err, "restoring collections")
}
restoreComplete <- struct{}{}
opStats.gc = gc.AwaitStatus()
// TODO(keepers): remove when fault.Errors handles all iterable error aggregation.
if opStats.gc.ErrorCount > 0 {
return nil, opStats.gc.Err
}
logger.Ctx(ctx).Debug(gc.PrintableStatus())
@ -273,10 +302,10 @@ func (op *RestoreOperation) persistResults(
if opStats.gc == nil {
op.Status = Failed
return errors.New("data restoration never completed")
return errors.New("restoration never completed")
}
if opStats.readErr == nil && opStats.writeErr == nil && opStats.gc.Successful == 0 {
if opStats.gc.Successful == 0 {
op.Status = NoData
}