diff --git a/CHANGELOG.md b/CHANGELOG.md index d4cb9534d..6d60c0e01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] (beta) +### Fixed +- Retry transient 400 "invalidRequest" errors during onedrive & sharepoint backup. + ### Changed - When running `backup details` on an empty backup returns a more helpful error message. diff --git a/src/pkg/services/m365/api/graph/errors.go b/src/pkg/services/m365/api/graph/errors.go index 76ed542a4..140c09617 100644 --- a/src/pkg/services/m365/api/graph/errors.go +++ b/src/pkg/services/m365/api/graph/errors.go @@ -52,7 +52,8 @@ const ( // This error occurs when an attempt is made to create a folder that has // the same name as another folder in the same parent. Such duplicate folder // names are not allowed by graph. - folderExists errorCode = "ErrorFolderExists" + folderExists errorCode = "ErrorFolderExists" + invalidRequest errorCode = "invalidRequest" // Some datacenters are returning this when we try to get the inbox of a user // that doesn't exist. invalidUser errorCode = "ErrorInvalidUser" @@ -169,6 +170,11 @@ func IsErrInvalidDelta(err error) bool { return parseODataErr(err).hasErrorCode(err, syncStateNotFound, resyncRequired, syncStateInvalid) } +func IsErrInvalidRequest(err error) bool { + return parseODataErr(err).hasResponseCode(err, http.StatusBadRequest) && + parseODataErr(err).hasErrorCode(err, invalidRequest) +} + func IsErrDeltaNotSupported(err error) bool { return parseODataErr(err).hasErrorMessage(err, ParameterDeltaTokenNotSupported) } @@ -557,6 +563,12 @@ func parseODataErr(err error) oDataErr { return ode } +// hasResponseCode checks if the error is an ODataError and carries the provided +// http response code. +func (ode oDataErr) hasResponseCode(err error, httpResponseCode int) bool { + return ode.isODataErr && ode.Resp.StatusCode == httpResponseCode +} + func (ode oDataErr) hasErrorCode(err error, codes ...errorCode) bool { if !ode.isODataErr { return false diff --git a/src/pkg/services/m365/api/graph/errors_test.go b/src/pkg/services/m365/api/graph/errors_test.go index 129bffb2e..23dd4ddb7 100644 --- a/src/pkg/services/m365/api/graph/errors_test.go +++ b/src/pkg/services/m365/api/graph/errors_test.go @@ -335,6 +335,50 @@ func (suite *GraphErrorsUnitSuite) TestIsErrDeltaNotSupported() { } } +func (suite *GraphErrorsUnitSuite) TestIsErrInvalidRequest() { + table := []struct { + name string + err error + expect assert.BoolAssertionFunc + }{ + { + name: "nil", + err: nil, + expect: assert.False, + }, + { + name: "non-matching", + err: assert.AnError, + expect: assert.False, + }, + { + name: "non-matching oDataErr", + err: graphTD.ODataErr("fnords"), + expect: assert.False, + }, + { + name: "non-matching oDataErrMsg", + err: graphTD.ODataErrWithMsg("fnords", "not supported"), + expect: assert.False, + }, + { + name: "invalidRequest oDataErr, missing 400 status code", + err: graphTD.ODataErr(string(invalidRequest)), + expect: assert.False, + }, + { + name: "invalidRequest oDataErr, with 400 status code", + err: graphTD.ODataErrWithAPIResponse(string(invalidRequest), http.StatusBadRequest), + expect: assert.True, + }, + } + for _, test := range table { + suite.Run(test.name, func() { + test.expect(suite.T(), IsErrInvalidRequest(test.err)) + }) + } +} + func (suite *GraphErrorsUnitSuite) TestIsErrQuotaExceeded() { table := []struct { name string diff --git a/src/pkg/services/m365/api/graph/service.go b/src/pkg/services/m365/api/graph/service.go index 685af34ec..ca1e6f680 100644 --- a/src/pkg/services/m365/api/graph/service.go +++ b/src/pkg/services/m365/api/graph/service.go @@ -345,6 +345,11 @@ func kiotaMiddlewares( var _ abstractions.RequestAdapter = &adapterWrap{} +const ( + // Delay between retry attempts + adapterRetryDelay = 3 * time.Second +) + // adapterWrap takes a GraphRequestAdapter and replaces the Send() function to // act as a middleware for all http calls. Certain error conditions never reach // the the client middleware layer, and therefore miss out on logging and retries. @@ -354,11 +359,16 @@ var _ abstractions.RequestAdapter = &adapterWrap{} // 3. Error and debug conditions are logged. type adapterWrap struct { abstractions.RequestAdapter - config *clientConfig + config *clientConfig + retryDelay time.Duration } func wrapAdapter(gra *msgraphsdkgo.GraphRequestAdapter, cc *clientConfig) *adapterWrap { - return &adapterWrap{gra, cc} + return &adapterWrap{ + RequestAdapter: gra, + config: cc, + retryDelay: adapterRetryDelay, + } } var connectionEnded = filters.Contains([]string{ @@ -407,11 +417,18 @@ func (aw *adapterWrap) Send( case IsErrBadJWTToken(err): logger.Ctx(ictx).Debug("bad jwt token") events.Inc(events.APICall, "badjwttoken") + case requestInfo.Method.String() == http.MethodGet && IsErrInvalidRequest(err): + // Graph may sometimes return a transient 400 response during onedrive + // and sharepoint backup. This is pending investigation on msft end, retry + // for now as it's a transient issue. Restrict retries to GET requests only + // to limit the scope of this fix. + logger.Ctx(ictx).Debug("invalid request") + events.Inc(events.APICall, "invalidrequest") default: return nil, clues.StackWC(ictx, err).WithTrace(1) } - time.Sleep(3 * time.Second) + time.Sleep(aw.retryDelay) } return sp, clues.Stack(err).OrNil() diff --git a/src/pkg/services/m365/api/graph/service_test.go b/src/pkg/services/m365/api/graph/service_test.go index 2fd27a46d..ba7bd56b0 100644 --- a/src/pkg/services/m365/api/graph/service_test.go +++ b/src/pkg/services/m365/api/graph/service_test.go @@ -1,6 +1,10 @@ package graph import ( + "bytes" + "context" + "encoding/json" + "io" "net/http" "strconv" "syscall" @@ -67,6 +71,9 @@ func (suite *GraphIntgSuite) TestCreateAdapter() { assert.NoError(t, err, clues.ToCore(err)) assert.NotNil(t, adpt) + + aw := adpt.(*adapterWrap) + assert.Equal(t, adapterRetryDelay, aw.retryDelay, "default retry delay") } func (suite *GraphIntgSuite) TestHTTPClient() { @@ -198,6 +205,10 @@ func (suite *GraphIntgSuite) TestAdapterWrap_catchesPanic() { appendMiddleware(&alwaysPanicMiddleware)) require.NoError(t, err, clues.ToCore(err)) + // Set retry delay to a low value to reduce test runtime. + aw := adpt.(*adapterWrap) + aw.retryDelay = 10 * time.Millisecond + // the query doesn't matter _, err = users.NewItemCalendarsItemEventsDeltaRequestBuilder(url, adpt).Get(ctx, nil) require.Error(t, err, clues.ToCore(err)) @@ -235,6 +246,10 @@ func (suite *GraphIntgSuite) TestAdapterWrap_retriesConnectionClose() { appendMiddleware(&alwaysECONNRESET)) require.NoError(t, err, clues.ToCore(err)) + // Set retry delay to a low value to reduce test runtime. + aw := adpt.(*adapterWrap) + aw.retryDelay = 10 * time.Millisecond + // the query doesn't matter _, err = users.NewItemCalendarsItemEventsDeltaRequestBuilder(url, adpt).Get(ctx, nil) require.ErrorIs(t, err, syscall.ECONNRESET, clues.ToCore(err)) @@ -292,6 +307,10 @@ func (suite *GraphIntgSuite) TestAdapterWrap_retriesBadJWTToken() { appendMiddleware(&alwaysBadJWT)) require.NoError(t, err, clues.ToCore(err)) + // Set retry delay to a low value to reduce test runtime. + aw := adpt.(*adapterWrap) + aw.retryDelay = 10 * time.Millisecond + // When run locally this may fail. Not sure why it works in github but not locally. // Pester keepers if it bothers you. _, err = users. @@ -307,3 +326,106 @@ func (suite *GraphIntgSuite) TestAdapterWrap_retriesBadJWTToken() { assert.True(t, IsErrBadJWTToken(err), clues.ToCore(err)) assert.Equal(t, 4, retryInc, "number of retries") } + +// TestAdapterWrap_retriesInvalidRequest tests adapter retries for graph 400 +// invalidRequest errors. It also tests that retries are only done for GET +// requests. +func (suite *GraphIntgSuite) TestAdapterWrap_retriesInvalidRequest() { + var ( + t = suite.T() + retryInc = 0 + // Formulate a graph error response which is parseable to odata error. + graphResp = map[string]any{ + "error": map[string]any{ + "code": invalidRequest, + "message": "Invalid request", + "innerError": map[string]any{ + "date": "2024-01-01T18:00:00", + "request-id": "rid", + "client-request-id": "cid", + }, + }, + } + ) + + serialized, err := json.Marshal(graphResp) + require.NoError(t, err, clues.ToCore(err)) + + l := int64(len(serialized)) + + // Set up a test middleware to always return a graph 400 invalidRequest + // response. + returnsGraphResp := mwForceResp{ + alternate: func(req *http.Request) (bool, *http.Response, error) { + retryInc++ + + header := http.Header{} + header.Set("Content-Length", strconv.Itoa(int(l))) + header.Set("Content-Type", "application/json") + + resp := &http.Response{ + Body: io.NopCloser(bytes.NewReader(serialized)), + ContentLength: l, + Header: header, + Proto: req.Proto, + Request: req, + StatusCode: http.StatusBadRequest, // Required retry condition + } + + return true, resp, nil + }, + } + + adpt, err := CreateAdapter( + suite.credentials.AzureTenantID, + suite.credentials.AzureClientID, + suite.credentials.AzureClientSecret, + count.New(), + appendMiddleware(&returnsGraphResp)) + require.NoError(t, err, clues.ToCore(err)) + + // Set retry delay to a low value to reduce test runtime. + aw := adpt.(*adapterWrap) + aw.retryDelay = 10 * time.Millisecond + + table := []struct { + name string + apiRequest func(t *testing.T, ctx context.Context) error + expectedRetries int + }{ + { + name: "GET request, retried", + apiRequest: func(t *testing.T, ctx context.Context) error { + _, err = NewService(adpt).Client().Users().Get(ctx, nil) + return err + }, + expectedRetries: 4, + }, + { + name: "POST request, no retry", + apiRequest: func(t *testing.T, ctx context.Context) error { + u := models.NewUser() + + cfg := users.UsersRequestBuilderPostRequestConfiguration{} + + _, err = NewService(adpt).Client().Users().Post(ctx, u, &cfg) + return err + }, + expectedRetries: 1, + }, + } + + for _, test := range table { + suite.Run(test.name, func() { + tt := suite.T() + retryInc = 0 + + ctx, flush := tester.NewContext(tt) + defer flush() + + err := test.apiRequest(tt, ctx) + assert.True(tt, IsErrInvalidRequest(err), clues.ToCore(err)) + assert.Equal(tt, test.expectedRetries, retryInc, "number of retries") + }) + } +} diff --git a/src/pkg/services/m365/api/graph/testdata/errors.go b/src/pkg/services/m365/api/graph/testdata/errors.go index 62bfb04e6..8cdef9826 100644 --- a/src/pkg/services/m365/api/graph/testdata/errors.go +++ b/src/pkg/services/m365/api/graph/testdata/errors.go @@ -57,6 +57,18 @@ func ODataInner(innerCode string) *odataerrors.ODataError { return odErr } +func ODataErrWithAPIResponse( + code string, + respCode int, +) *odataerrors.ODataError { + odErr := ODataErr(code) + + // TODO(pandeyabs): Expand this function to set response headers as well. + odErr.SetStatusCode(respCode) + + return odErr +} + func ParseableToMap(t *testing.T, thing serialization.Parsable) map[string]any { sw := kjson.NewJsonSerializationWriter()