From f92f8115599f8343ace6459a5ace66b2f495358e Mon Sep 17 00:00:00 2001 From: Abhishek Pandey Date: Fri, 9 Feb 2024 21:11:05 -0800 Subject: [PATCH] Retry more errors in graph adapter wrapper (#5203) * We have started seeing `io.ErrUnexpectedEOF` and `read: connection timed out` in the last 2 days for exchange backups. Retry those. * Also increase retry count from 3 to 6. This is more of a hail mary to retry `InvalidAuthenticationToken` errors. We have observed that retries do help. But for a small set of requests, we end up exhausting retries and eventually fail with InvalidAuthenticationToken error. Hoping that bumping this to 6 will get us some relief. This fix may be removed if we find the rootcause/pattern behind this. * The event list test was taking > 150 secs. Thought I'd push this change as its a small fix. I first thought my PR https://github.com/alcionai/corso/pull/5202 broke the test. So I investigated this. --- #### Does this PR need a docs update or release note? - [ ] :white_check_mark: Yes, it's included - [ ] :clock1: Yes, but in a later PR - [x] :no_entry: No #### Type of change - [ ] :sunflower: Feature - [ ] :bug: Bugfix - [ ] :world_map: Documentation - [x] :robot: Supportability/Tests - [ ] :computer: CI/Deployment - [ ] :broom: Tech Debt/Cleanup #### Issue(s) * # #### Test Plan - [ ] :muscle: Manual - [x] :zap: Unit test - [ ] :green_heart: E2E --- .../services/m365/api/events_pager_test.go | 10 +- src/pkg/services/m365/api/graph/service.go | 21 ++- .../services/m365/api/graph/service_test.go | 121 ++++++++++++------ src/pkg/services/m365/api/helper_test.go | 12 +- 4 files changed, 116 insertions(+), 48 deletions(-) diff --git a/src/pkg/services/m365/api/events_pager_test.go b/src/pkg/services/m365/api/events_pager_test.go index 2da9140cf..7bfb8db54 100644 --- a/src/pkg/services/m365/api/events_pager_test.go +++ b/src/pkg/services/m365/api/events_pager_test.go @@ -18,6 +18,7 @@ import ( "github.com/alcionai/corso/src/internal/tester" "github.com/alcionai/corso/src/internal/tester/tconfig" "github.com/alcionai/corso/src/pkg/count" + "github.com/alcionai/corso/src/pkg/services/m365/api/graph" ) // --------------------------------------------------------------------------- @@ -107,13 +108,10 @@ func (suite *EventsPagerUnitSuite) TestEventsList() { Reply(http.StatusOK). JSON(validEventsListSingleNextLinkResponse) - // Number of retries and delay between retries is handled by a kiota - // middleware. We can change the default config parameters when setting - // up the mock in a later PR. gock.New(graphAPIHostURL). Get(nextLinkPath). AddMatcher(pageSizeMatcher(t, expectedPageSize)). - Times(4). + Times(2). // retry count is configured to 1 Reply(http.StatusServiceUnavailable). BodyString(""). Type("text/plain") @@ -241,7 +239,9 @@ func (suite *EventsPagerUnitSuite) TestEventsList() { creds, err := a.M365Config() require.NoError(t, err, clues.ToCore(err)) - client, err := gockClient(creds, count.New()) + // Run with a single retry since 503 retries are exponential and + // the test will take a long time to run. + client, err := gockClient(creds, count.New(), graph.MaxRetries(1)) require.NoError(t, err, clues.ToCore(err)) t.Cleanup(gock.Off) diff --git a/src/pkg/services/m365/api/graph/service.go b/src/pkg/services/m365/api/graph/service.go index 90656ce8e..56ef60eac 100644 --- a/src/pkg/services/m365/api/graph/service.go +++ b/src/pkg/services/m365/api/graph/service.go @@ -2,6 +2,7 @@ package graph import ( "context" + "io" "net/http" "time" @@ -173,7 +174,15 @@ func KiotaHTTPClient( const ( defaultDelay = 3 * time.Second defaultHTTPClientTimeout = 1 * time.Hour - defaultMaxRetries = 3 + + // Default retry count for retry middlewares + defaultMaxRetries = 3 + // Retry count for graph adapter + // + // Bumping retries to 6 since we have noticed that auth token expiry errors + // may continue to persist even after 3 retries. + adapterMaxRetries = 6 + // FIXME: This should ideally be 0, but if we set to 0, graph // client with automatically set the context timeout to 0 as // well which will make the client unusable. @@ -200,7 +209,7 @@ type Option func(*clientConfig) // populate constructs a clientConfig according to the provided options. func populateConfig(opts ...Option) *clientConfig { cc := clientConfig{ - maxConnectionRetries: defaultMaxRetries, + maxConnectionRetries: adapterMaxRetries, maxRetries: defaultMaxRetries, minDelay: defaultDelay, timeout: defaultHTTPClientTimeout, @@ -373,9 +382,11 @@ func wrapAdapter(gra *msgraphsdkgo.GraphRequestAdapter, cc *clientConfig) *adapt } } -var connectionEnded = filters.Contains([]string{ +// Graph may abruptly close connections, which we should retry. +var connectionEnded = filters.In([]string{ "connection reset by peer", "client connection force closed", + "read: connection timed out", }) func (aw *adapterWrap) Send( @@ -416,7 +427,9 @@ func (aw *adapterWrap) Send( err = stackWithCoreErr(ictx, err, 1) e = err - if IsErrConnectionReset(err) || connectionEnded.Compare(err.Error()) { + if IsErrConnectionReset(err) || + connectionEnded.Compare(err.Error()) || + errors.Is(err, io.ErrUnexpectedEOF) { logger.Ctx(ictx).Debug("http connection error") events.Inc(events.APICall, "connectionerror") } else if errors.Is(err, core.ErrAuthTokenExpired) { diff --git a/src/pkg/services/m365/api/graph/service_test.go b/src/pkg/services/m365/api/graph/service_test.go index 56cab3890..9cb853da4 100644 --- a/src/pkg/services/m365/api/graph/service_test.go +++ b/src/pkg/services/m365/api/graph/service_test.go @@ -93,7 +93,7 @@ func (suite *GraphIntgSuite) TestHTTPClient() { checkConfig: func(t *testing.T, c *clientConfig) { assert.Equal(t, defaultDelay, c.minDelay, "default delay") assert.Equal(t, defaultMaxRetries, c.maxRetries, "max retries") - assert.Equal(t, defaultMaxRetries, c.maxConnectionRetries, "max connection retries") + assert.Equal(t, adapterMaxRetries, c.maxConnectionRetries, "max connection retries") }, }, { @@ -221,43 +221,92 @@ func (suite *GraphIntgSuite) TestAdapterWrap_catchesPanic() { require.Contains(t, err.Error(), "panic", clues.ToCore(err)) } -func (suite *GraphIntgSuite) TestAdapterWrap_retriesConnectionClose() { - t := suite.T() - - ctx, flush := tester.NewContext(t) - defer flush() - - retryInc := 0 - - // the panics should get caught and returned as errors - alwaysECONNRESET := mwForceResp{ - err: syscall.ECONNRESET, - alternate: func(req *http.Request) (bool, *http.Response, error) { - retryInc++ - return false, nil, nil +func (suite *GraphIntgSuite) TestAdapterWrap_retriesConnectionInterruptions() { + table := []struct { + name string + providedErr error + expectRetryCount int + expectErr assert.ErrorAssertionFunc + }{ + { + name: "ECONNRESET", + providedErr: syscall.ECONNRESET, + expectRetryCount: 7, + expectErr: assert.Error, + }, + { + name: "connection reset by peer", + providedErr: clues.New("connection reset by peer what is love"), + expectRetryCount: 7, + expectErr: assert.Error, + }, + { + name: "read: connection timed out", + providedErr: clues.New("read: connection timed out baby don't hurt me"), + expectRetryCount: 7, + expectErr: assert.Error, + }, + { + name: "unexpected EOF", + providedErr: io.ErrUnexpectedEOF, + expectRetryCount: 7, + expectErr: assert.Error, + }, + { + name: "nil error", + providedErr: nil, + expectRetryCount: 1, + expectErr: assert.NoError, + }, + { + name: "non retriable error", + providedErr: clues.New("no more"), + expectRetryCount: 1, + expectErr: assert.Error, }, } - adpt, err := CreateAdapter( - suite.credentials.AzureTenantID, - suite.credentials.AzureClientID, - suite.credentials.AzureClientSecret, - count.New(), - appendMiddleware(&alwaysECONNRESET), - // Configure retry middlewares so that they don't retry on connection reset. - // Those middlewares have their own tests to verify retries. - MaxRetries(-1)) - require.NoError(t, err, clues.ToCore(err)) + for _, test := range table { + suite.Run(test.name, func() { + t := suite.T() - // Retry delay doesn't really matter here since all requests will be intercepted - // by the test middleware. Set it to 0 to reduce test runtime. - aw := adpt.(*adapterWrap) - aw.retryDelay = 0 + ctx, flush := tester.NewContext(t) + defer flush() - // the query doesn't matter - _, err = NewService(adpt).Client().Users().Get(ctx, nil) - require.ErrorIs(t, err, syscall.ECONNRESET, clues.ToCore(err)) - require.Equal(t, 4, retryInc, "number of retries") + retryInc := 0 + forceErrMW := mwForceResp{ + err: test.providedErr, + // send some dummy response, because kiota retry handler panics + // if err is nil and resp is nil. + resp: &http.Response{}, + alternate: func(req *http.Request) (bool, *http.Response, error) { + retryInc++ + return false, nil, nil + }, + } + + adpt, err := CreateAdapter( + suite.credentials.AzureTenantID, + suite.credentials.AzureClientID, + suite.credentials.AzureClientSecret, + count.New(), + appendMiddleware(&forceErrMW), + // Configure retry middlewares so that they don't retry on connection reset. + // Those middlewares have their own tests to verify retries. + MaxRetries(-1)) + require.NoError(t, err, clues.ToCore(err)) + + // Retry delay doesn't really matter here since all requests will be intercepted + // by the test middleware. Set it to 0 to reduce test runtime. + aw := adpt.(*adapterWrap) + aw.retryDelay = 0 + + // the query doesn't matter + _, err = NewService(adpt).Client().Users().Get(ctx, nil) + test.expectErr(t, err, clues.ToCore(err)) + require.Equal(t, test.expectRetryCount, retryInc, "number of retries") + }) + } } func (suite *GraphIntgSuite) TestAdapterWrap_retriesBadJWTToken() { @@ -315,14 +364,14 @@ func (suite *GraphIntgSuite) TestAdapterWrap_retriesBadJWTToken() { NewItemCalendarsItemEventsDeltaRequestBuilder("https://graph.microsoft.com/fnords/beaux/regard", adpt). Get(ctx, nil) assert.ErrorIs(t, err, core.ErrAuthTokenExpired, clues.ToCore(err)) - assert.Equal(t, 4, retryInc, "number of retries") + assert.Equal(t, adapterMaxRetries+1, retryInc, "number of retries") retryInc = 0 // the query doesn't matter _, err = NewService(adpt).Client().Users().Get(ctx, nil) assert.ErrorIs(t, err, core.ErrAuthTokenExpired, clues.ToCore(err)) - assert.Equal(t, 4, retryInc, "number of retries") + assert.Equal(t, adapterMaxRetries+1, retryInc, "number of retries") } // TestAdapterWrap_retriesInvalidRequest tests adapter retries for graph 400 @@ -398,7 +447,7 @@ func (suite *GraphIntgSuite) TestAdapterWrap_retriesInvalidRequest() { _, err = NewService(adpt).Client().Users().Get(ctx, nil) return err }, - expectedRetries: 4, + expectedRetries: adapterMaxRetries + 1, }, { name: "POST request, no retry", diff --git a/src/pkg/services/m365/api/helper_test.go b/src/pkg/services/m365/api/helper_test.go index cda1182f4..09eca18ae 100644 --- a/src/pkg/services/m365/api/helper_test.go +++ b/src/pkg/services/m365/api/helper_test.go @@ -23,13 +23,19 @@ import ( // GockClient produces a new exchange api client that can be // mocked using gock. -func gockClient(creds account.M365Config, counter *count.Bus) (Client, error) { - s, err := graph.NewGockService(creds, counter) +func gockClient( + creds account.M365Config, + counter *count.Bus, + opts ...graph.Option, +) (Client, error) { + s, err := graph.NewGockService(creds, counter, opts...) if err != nil { return Client{}, err } - li, err := graph.NewGockService(creds, counter, graph.NoTimeout()) + opts = append(opts, graph.NoTimeout()) + + li, err := graph.NewGockService(creds, counter, opts...) if err != nil { return Client{}, err }