Add retries for posting item attachments (#4840)
Add exponential backoff and some retries when posting item attachments. Manual testing was showing upwards of 50 retries in some cases when ~250-300 attachments total were being added. Of the retries, none were at the max amount. However, all tests were for relatively small datasets --- #### Does this PR need a docs update or release note? - [x] ✅ Yes, it's included - [ ] 🕐 Yes, but in a later PR - [ ] ⛔ No #### Type of change - [ ] 🌻 Feature - [x] 🐛 Bugfix - [ ] 🗺️ Documentation - [ ] 🤖 Supportability/Tests - [ ] 💻 CI/Deployment - [ ] 🧹 Tech Debt/Cleanup #### Test Plan - [x] 💪 Manual - [ ] ⚡ Unit test - [ ] 💚 E2E
This commit is contained in:
parent
d16528be50
commit
4b64faef70
@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
### Fixed
|
||||
- Handle the case where an email cannot be retrieved from Exchange due to an `ErrorInvalidRecipients` error. In
|
||||
this case, Corso will skip over the item but report this in the backup summary.
|
||||
- Guarantee Exchange email restoration when restoring multiple attachments. Some previous restores were failing with `ErrorItemNotFound`.
|
||||
|
||||
## [v0.17.0] (beta) - 2023-12-11
|
||||
|
||||
|
||||
@ -4,8 +4,10 @@ import (
|
||||
"bytes"
|
||||
"context"
|
||||
"runtime/trace"
|
||||
"time"
|
||||
|
||||
"github.com/alcionai/clues"
|
||||
"github.com/cenkalti/backoff/v4"
|
||||
"github.com/microsoftgraph/msgraph-sdk-go/models"
|
||||
|
||||
"github.com/alcionai/corso/src/internal/common/ptr"
|
||||
@ -213,6 +215,14 @@ func getOrPopulateContainer(
|
||||
return folderID, nil
|
||||
}
|
||||
|
||||
const (
|
||||
maxRetries = 3
|
||||
retryInterval = 750 * time.Millisecond
|
||||
retryMultiplier = 2
|
||||
retryMaxInterval = 5 * time.Second
|
||||
retryMaxElapsedTime = 0
|
||||
)
|
||||
|
||||
func uploadAttachments(
|
||||
ctx context.Context,
|
||||
ap attachmentPoster,
|
||||
@ -222,18 +232,57 @@ func uploadAttachments(
|
||||
) error {
|
||||
el := errs.Local()
|
||||
|
||||
for _, a := range as {
|
||||
// Manual testing showed that even small delays between retries could get us
|
||||
// past the issues. Use an exponential backoff that will allow us to keep
|
||||
// runtimes low if there's just one failure for a given attachment but still
|
||||
// waits a reasonable amount of time if there's multiple failures.
|
||||
retryBackoff := backoff.NewExponentialBackOff()
|
||||
retryBackoff.InitialInterval = retryInterval
|
||||
retryBackoff.Multiplier = retryMultiplier
|
||||
retryBackoff.MaxInterval = retryMaxInterval
|
||||
retryBackoff.MaxElapsedTime = retryMaxElapsedTime
|
||||
|
||||
for i, a := range as {
|
||||
if el.Failure() != nil {
|
||||
return el.Failure()
|
||||
}
|
||||
|
||||
err := uploadAttachment(
|
||||
ctx,
|
||||
actx := clues.Add(ctx, "attachment_num", i)
|
||||
|
||||
var (
|
||||
retry int
|
||||
err error
|
||||
)
|
||||
|
||||
retryBackoff.Reset()
|
||||
|
||||
for (retry == 0 || graph.IsErrItemNotFound(err)) && retry <= maxRetries {
|
||||
retry++
|
||||
|
||||
ictx := clues.Add(actx, "attempt_num", retry)
|
||||
|
||||
err = uploadAttachment(
|
||||
ictx,
|
||||
ap,
|
||||
resourceID,
|
||||
destinationID,
|
||||
itemID,
|
||||
a)
|
||||
|
||||
// Sometimes graph returns a 404 when we try to post the attachment.
|
||||
// We're not sure why, but maybe it has to do with attaching many items.
|
||||
// In any case, wait a little while and try again.
|
||||
if graph.IsErrItemNotFound(err) && retry <= maxRetries {
|
||||
waitTime := retryBackoff.NextBackOff()
|
||||
|
||||
logger.Ctx(ictx).Infow(
|
||||
"error uploading attachment, retrying",
|
||||
"retry_after", waitTime)
|
||||
|
||||
time.Sleep(waitTime)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
// FIXME: I don't know why we're swallowing this error case.
|
||||
// It needs investigation: https://github.com/alcionai/corso/issues/3498
|
||||
@ -247,7 +296,7 @@ func uploadAttachments(
|
||||
continue
|
||||
}
|
||||
|
||||
el.AddRecoverable(ctx, clues.WrapWC(ctx, err, "uploading mail attachment"))
|
||||
el.AddRecoverable(actx, clues.WrapWC(ctx, err, "uploading mail attachment"))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -166,7 +166,7 @@ func IsErrDeletedInFlight(err error) bool {
|
||||
}
|
||||
|
||||
func IsErrItemNotFound(err error) bool {
|
||||
return hasErrorCode(err, itemNotFound)
|
||||
return hasErrorCode(err, itemNotFound, errorItemNotFound)
|
||||
}
|
||||
|
||||
func IsErrInvalidDelta(err error) bool {
|
||||
|
||||
@ -858,10 +858,15 @@ func (suite *GraphErrorsUnitSuite) TestIsErrItemNotFound() {
|
||||
expect: assert.False,
|
||||
},
|
||||
{
|
||||
name: "item nott found oDataErr",
|
||||
name: "item not found oDataErr",
|
||||
err: graphTD.ODataErr(string(itemNotFound)),
|
||||
expect: assert.True,
|
||||
},
|
||||
{
|
||||
name: "error item not found oDataErr",
|
||||
err: graphTD.ODataErr(string(errorItemNotFound)),
|
||||
expect: assert.True,
|
||||
},
|
||||
}
|
||||
for _, test := range table {
|
||||
suite.Run(test.name, func() {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user