Add retries for posting item attachments (#4840)

Add exponential backoff and some retries when posting item attachments.

Manual testing was showing upwards of 50 retries in some cases when
~250-300 attachments total were being added. Of the retries, none
were at the max amount. However, all tests were for relatively small
datasets

---

#### Does this PR need a docs update or release note?

- [x]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [ ]  No

#### Type of change

- [ ] 🌻 Feature
- [x] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Test Plan

- [x] 💪 Manual
- [ ]  Unit test
- [ ] 💚 E2E
This commit is contained in:
ashmrtn 2023-12-13 15:59:42 -08:00 committed by GitHub
parent d16528be50
commit 4b64faef70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 66 additions and 11 deletions

View File

@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed ### Fixed
- Handle the case where an email cannot be retrieved from Exchange due to an `ErrorInvalidRecipients` error. In - Handle the case where an email cannot be retrieved from Exchange due to an `ErrorInvalidRecipients` error. In
this case, Corso will skip over the item but report this in the backup summary. this case, Corso will skip over the item but report this in the backup summary.
- Guarantee Exchange email restoration when restoring multiple attachments. Some previous restores were failing with `ErrorItemNotFound`.
## [v0.17.0] (beta) - 2023-12-11 ## [v0.17.0] (beta) - 2023-12-11

View File

@ -4,8 +4,10 @@ import (
"bytes" "bytes"
"context" "context"
"runtime/trace" "runtime/trace"
"time"
"github.com/alcionai/clues" "github.com/alcionai/clues"
"github.com/cenkalti/backoff/v4"
"github.com/microsoftgraph/msgraph-sdk-go/models" "github.com/microsoftgraph/msgraph-sdk-go/models"
"github.com/alcionai/corso/src/internal/common/ptr" "github.com/alcionai/corso/src/internal/common/ptr"
@ -213,6 +215,14 @@ func getOrPopulateContainer(
return folderID, nil return folderID, nil
} }
const (
maxRetries = 3
retryInterval = 750 * time.Millisecond
retryMultiplier = 2
retryMaxInterval = 5 * time.Second
retryMaxElapsedTime = 0
)
func uploadAttachments( func uploadAttachments(
ctx context.Context, ctx context.Context,
ap attachmentPoster, ap attachmentPoster,
@ -222,18 +232,57 @@ func uploadAttachments(
) error { ) error {
el := errs.Local() el := errs.Local()
for _, a := range as { // Manual testing showed that even small delays between retries could get us
// past the issues. Use an exponential backoff that will allow us to keep
// runtimes low if there's just one failure for a given attachment but still
// waits a reasonable amount of time if there's multiple failures.
retryBackoff := backoff.NewExponentialBackOff()
retryBackoff.InitialInterval = retryInterval
retryBackoff.Multiplier = retryMultiplier
retryBackoff.MaxInterval = retryMaxInterval
retryBackoff.MaxElapsedTime = retryMaxElapsedTime
for i, a := range as {
if el.Failure() != nil { if el.Failure() != nil {
return el.Failure() return el.Failure()
} }
err := uploadAttachment( actx := clues.Add(ctx, "attachment_num", i)
ctx,
var (
retry int
err error
)
retryBackoff.Reset()
for (retry == 0 || graph.IsErrItemNotFound(err)) && retry <= maxRetries {
retry++
ictx := clues.Add(actx, "attempt_num", retry)
err = uploadAttachment(
ictx,
ap, ap,
resourceID, resourceID,
destinationID, destinationID,
itemID, itemID,
a) a)
// Sometimes graph returns a 404 when we try to post the attachment.
// We're not sure why, but maybe it has to do with attaching many items.
// In any case, wait a little while and try again.
if graph.IsErrItemNotFound(err) && retry <= maxRetries {
waitTime := retryBackoff.NextBackOff()
logger.Ctx(ictx).Infow(
"error uploading attachment, retrying",
"retry_after", waitTime)
time.Sleep(waitTime)
}
}
if err != nil { if err != nil {
// FIXME: I don't know why we're swallowing this error case. // FIXME: I don't know why we're swallowing this error case.
// It needs investigation: https://github.com/alcionai/corso/issues/3498 // It needs investigation: https://github.com/alcionai/corso/issues/3498
@ -247,7 +296,7 @@ func uploadAttachments(
continue continue
} }
el.AddRecoverable(ctx, clues.WrapWC(ctx, err, "uploading mail attachment")) el.AddRecoverable(actx, clues.WrapWC(ctx, err, "uploading mail attachment"))
} }
} }

View File

@ -166,7 +166,7 @@ func IsErrDeletedInFlight(err error) bool {
} }
func IsErrItemNotFound(err error) bool { func IsErrItemNotFound(err error) bool {
return hasErrorCode(err, itemNotFound) return hasErrorCode(err, itemNotFound, errorItemNotFound)
} }
func IsErrInvalidDelta(err error) bool { func IsErrInvalidDelta(err error) bool {

View File

@ -858,10 +858,15 @@ func (suite *GraphErrorsUnitSuite) TestIsErrItemNotFound() {
expect: assert.False, expect: assert.False,
}, },
{ {
name: "item nott found oDataErr", name: "item not found oDataErr",
err: graphTD.ODataErr(string(itemNotFound)), err: graphTD.ODataErr(string(itemNotFound)),
expect: assert.True, expect: assert.True,
}, },
{
name: "error item not found oDataErr",
err: graphTD.ODataErr(string(errorItemNotFound)),
expect: assert.True,
},
} }
for _, test := range table { for _, test := range table {
suite.Run(test.name, func() { suite.Run(test.name, func() {