extra panic protection in operations (#2383)

## Does this PR need a docs update or release note?

- [x] No

## Type of change

- [x] 🧹 Tech Debt/Cleanup

## Test Plan

- [x] 💚 E2E
This commit is contained in:
Keepers 2023-02-02 19:01:42 -07:00 committed by GitHub
parent b00b41a6bd
commit 0436e0d128
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 52 additions and 11 deletions

View File

@ -2,6 +2,7 @@ package operations
import ( import (
"context" "context"
"runtime/debug"
"time" "time"
"github.com/alcionai/clues" "github.com/alcionai/clues"
@ -106,7 +107,13 @@ type detailsWriter interface {
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Run begins a synchronous backup operation. // Run begins a synchronous backup operation.
func (op *BackupOperation) Run(ctx context.Context) error { func (op *BackupOperation) Run(ctx context.Context) (err error) {
defer func() {
if r := recover(); r != nil {
err = clues.Wrap(r.(error), "panic recovery").WithClues(ctx).With("stacktrace", debug.Stack())
}
}()
ctx, end := D.Span(ctx, "operations:backup:run") ctx, end := D.Span(ctx, "operations:backup:run")
defer func() { defer func() {
end() end()
@ -189,6 +196,8 @@ func (op *BackupOperation) do(ctx context.Context) (err error) {
op.Errors.Fail(errors.Wrap(err, "collecting manifest heuristics")) op.Errors.Fail(errors.Wrap(err, "collecting manifest heuristics"))
opStats.readErr = op.Errors.Err() opStats.readErr = op.Errors.Err()
logger.Ctx(ctx).With("err", err).Errorw("producing manifests and metadata", clues.InErr(err).Slice()...)
return opStats.readErr return opStats.readErr
} }
@ -197,6 +206,8 @@ func (op *BackupOperation) do(ctx context.Context) (err error) {
op.Errors.Fail(errors.Wrap(err, "connecting to m365")) op.Errors.Fail(errors.Wrap(err, "connecting to m365"))
opStats.readErr = op.Errors.Err() opStats.readErr = op.Errors.Err()
logger.Ctx(ctx).With("err", err).Errorw("connectng to m365", clues.InErr(err).Slice()...)
return opStats.readErr return opStats.readErr
} }
@ -205,6 +216,8 @@ func (op *BackupOperation) do(ctx context.Context) (err error) {
op.Errors.Fail(errors.Wrap(err, "retrieving data to backup")) op.Errors.Fail(errors.Wrap(err, "retrieving data to backup"))
opStats.readErr = op.Errors.Err() opStats.readErr = op.Errors.Err()
logger.Ctx(ctx).With("err", err).Errorw("producing backup data collections", clues.InErr(err).Slice()...)
return opStats.readErr return opStats.readErr
} }
@ -223,6 +236,8 @@ func (op *BackupOperation) do(ctx context.Context) (err error) {
op.Errors.Fail(errors.Wrap(err, "backing up service data")) op.Errors.Fail(errors.Wrap(err, "backing up service data"))
opStats.writeErr = op.Errors.Err() opStats.writeErr = op.Errors.Err()
logger.Ctx(ctx).With("err", err).Errorw("persisting collection backups", clues.InErr(err).Slice()...)
return opStats.writeErr return opStats.writeErr
} }
@ -237,6 +252,8 @@ func (op *BackupOperation) do(ctx context.Context) (err error) {
op.Errors.Fail(errors.Wrap(err, "merging backup details")) op.Errors.Fail(errors.Wrap(err, "merging backup details"))
opStats.writeErr = op.Errors.Err() opStats.writeErr = op.Errors.Err()
logger.Ctx(ctx).With("err", err).Errorw("merging details", clues.InErr(err).Slice()...)
return opStats.writeErr return opStats.writeErr
} }
@ -589,15 +606,21 @@ func (op *BackupOperation) persistResults(
opStats.writeErr) opStats.writeErr)
} }
op.Results.BytesRead = opStats.k.TotalHashedBytes
op.Results.BytesUploaded = opStats.k.TotalUploadedBytes
op.Results.ItemsWritten = opStats.k.TotalFileCount
op.Results.ResourceOwners = opStats.resourceCount
if opStats.gc == nil {
op.Status = Failed
return errors.New("data population never completed")
}
if opStats.readErr == nil && opStats.writeErr == nil && opStats.gc.Successful == 0 { if opStats.readErr == nil && opStats.writeErr == nil && opStats.gc.Successful == 0 {
op.Status = NoData op.Status = NoData
} }
op.Results.BytesRead = opStats.k.TotalHashedBytes
op.Results.BytesUploaded = opStats.k.TotalUploadedBytes
op.Results.ItemsRead = opStats.gc.Successful op.Results.ItemsRead = opStats.gc.Successful
op.Results.ItemsWritten = opStats.k.TotalFileCount
op.Results.ResourceOwners = opStats.resourceCount
return nil return nil
} }

View File

@ -3,6 +3,7 @@ package operations
import ( import (
"context" "context"
"fmt" "fmt"
"runtime/debug"
"time" "time"
"github.com/alcionai/clues" "github.com/alcionai/clues"
@ -106,6 +107,12 @@ type restorer interface {
// Run begins a synchronous restore operation. // Run begins a synchronous restore operation.
func (op *RestoreOperation) Run(ctx context.Context) (restoreDetails *details.Details, err error) { func (op *RestoreOperation) Run(ctx context.Context) (restoreDetails *details.Details, err error) {
defer func() {
if r := recover(); r != nil {
err = clues.Wrap(r.(error), "panic recovery").WithClues(ctx).With("stacktrace", debug.Stack())
}
}()
ctx, end := D.Span(ctx, "operations:restore:run") ctx, end := D.Span(ctx, "operations:restore:run")
defer func() { defer func() {
end() end()
@ -250,14 +257,20 @@ func (op *RestoreOperation) persistResults(
opStats.writeErr) opStats.writeErr)
} }
op.Results.BytesRead = opStats.bytesRead.NumBytes
op.Results.ItemsRead = len(opStats.cs) // TODO: file count, not collection count
op.Results.ResourceOwners = opStats.resourceCount
if opStats.gc == nil {
op.Status = Failed
return errors.New("data restoration never completed")
}
if opStats.readErr == nil && opStats.writeErr == nil && opStats.gc.Successful == 0 { if opStats.readErr == nil && opStats.writeErr == nil && opStats.gc.Successful == 0 {
op.Status = NoData op.Status = NoData
} }
op.Results.BytesRead = opStats.bytesRead.NumBytes
op.Results.ItemsRead = len(opStats.cs) // TODO: file count, not collection count
op.Results.ItemsWritten = opStats.gc.Successful op.Results.ItemsWritten = opStats.gc.Successful
op.Results.ResourceOwners = opStats.resourceCount
dur := op.Results.CompletedAt.Sub(op.Results.StartedAt) dur := op.Results.CompletedAt.Sub(op.Results.StartedAt)

View File

@ -87,11 +87,12 @@ func (e *Errors) Fail(err error) *Errors {
// setErr handles setting errors.err. Sync locking gets // setErr handles setting errors.err. Sync locking gets
// handled upstream of this call. // handled upstream of this call.
func (e *Errors) setErr(err error) *Errors { func (e *Errors) setErr(err error) *Errors {
if e.err != nil { if e.err == nil {
return e.addErr(err) e.err = err
return e
} }
e.err = err e.errs = append(e.errs, err)
return e return e
} }

View File

@ -73,6 +73,8 @@ func (suite *FaultErrorsUnitSuite) TestErr() {
suite.T().Run(test.name, func(t *testing.T) { suite.T().Run(test.name, func(t *testing.T) {
n := fault.New(test.failFast) n := fault.New(test.failFast)
require.NotNil(t, n) require.NotNil(t, n)
require.NoError(t, n.Err())
require.Empty(t, n.Errs())
e := n.Fail(test.fail) e := n.Fail(test.fail)
require.NotNil(t, e) require.NotNil(t, e)
@ -90,6 +92,8 @@ func (suite *FaultErrorsUnitSuite) TestFail() {
n := fault.New(false) n := fault.New(false)
require.NotNil(t, n) require.NotNil(t, n)
require.NoError(t, n.Err())
require.Empty(t, n.Errs())
n.Fail(assert.AnError) n.Fail(assert.AnError)
assert.Error(t, n.Err()) assert.Error(t, n.Err())