Add retries for transient graph 400 "invalidRequest" errors (#4993)

<!-- PR description-->

* Adding a retry for 400 "invalidRequest" errors. Restricting it to GET requests to limit the scope of the fix.
* Retrying at adapter wrapper `Send` level instead of doing it in retry middleware. If we do it in middleware, we'll have to resort to a string search of "invalidRequest" in the http response, which is not the best way. kiota-http nicely parses the http response into an odata error after the request has finished. So doing retry after that in adapter wrapper `Send` is better as we have better guarantees on it being an odata error type. 
* Also added a small test optimization to reduce test run time.

---

#### Does this PR need a docs update or release note?

- [x]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [ ]  No

#### Type of change

<!--- Please check the type of change your PR introduces: --->
- [ ] 🌻 Feature
- [x] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

<!-- Can reference multiple issues. Use one of the following "magic words" - "closes, fixes" to auto-close the Github issue. -->
* #<issue>

#### Test Plan

<!-- How will this be tested prior to merging.-->
- [x] 💪 Manual
- [x]  Unit test
- [ ] 💚 E2E
This commit is contained in:
Abhishek Pandey 2024-01-09 19:17:50 -08:00 committed by GitHub
parent a40807aa18
commit dbac8a9fbf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 214 additions and 4 deletions

View File

@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] (beta) ## [Unreleased] (beta)
### Fixed
- Retry transient 400 "invalidRequest" errors during onedrive & sharepoint backup.
### Changed ### Changed
- When running `backup details` on an empty backup returns a more helpful error message. - When running `backup details` on an empty backup returns a more helpful error message.

View File

@ -52,7 +52,8 @@ const (
// This error occurs when an attempt is made to create a folder that has // This error occurs when an attempt is made to create a folder that has
// the same name as another folder in the same parent. Such duplicate folder // the same name as another folder in the same parent. Such duplicate folder
// names are not allowed by graph. // names are not allowed by graph.
folderExists errorCode = "ErrorFolderExists" folderExists errorCode = "ErrorFolderExists"
invalidRequest errorCode = "invalidRequest"
// Some datacenters are returning this when we try to get the inbox of a user // Some datacenters are returning this when we try to get the inbox of a user
// that doesn't exist. // that doesn't exist.
invalidUser errorCode = "ErrorInvalidUser" invalidUser errorCode = "ErrorInvalidUser"
@ -169,6 +170,11 @@ func IsErrInvalidDelta(err error) bool {
return parseODataErr(err).hasErrorCode(err, syncStateNotFound, resyncRequired, syncStateInvalid) return parseODataErr(err).hasErrorCode(err, syncStateNotFound, resyncRequired, syncStateInvalid)
} }
func IsErrInvalidRequest(err error) bool {
return parseODataErr(err).hasResponseCode(err, http.StatusBadRequest) &&
parseODataErr(err).hasErrorCode(err, invalidRequest)
}
func IsErrDeltaNotSupported(err error) bool { func IsErrDeltaNotSupported(err error) bool {
return parseODataErr(err).hasErrorMessage(err, ParameterDeltaTokenNotSupported) return parseODataErr(err).hasErrorMessage(err, ParameterDeltaTokenNotSupported)
} }
@ -557,6 +563,12 @@ func parseODataErr(err error) oDataErr {
return ode return ode
} }
// hasResponseCode checks if the error is an ODataError and carries the provided
// http response code.
func (ode oDataErr) hasResponseCode(err error, httpResponseCode int) bool {
return ode.isODataErr && ode.Resp.StatusCode == httpResponseCode
}
func (ode oDataErr) hasErrorCode(err error, codes ...errorCode) bool { func (ode oDataErr) hasErrorCode(err error, codes ...errorCode) bool {
if !ode.isODataErr { if !ode.isODataErr {
return false return false

View File

@ -335,6 +335,50 @@ func (suite *GraphErrorsUnitSuite) TestIsErrDeltaNotSupported() {
} }
} }
func (suite *GraphErrorsUnitSuite) TestIsErrInvalidRequest() {
table := []struct {
name string
err error
expect assert.BoolAssertionFunc
}{
{
name: "nil",
err: nil,
expect: assert.False,
},
{
name: "non-matching",
err: assert.AnError,
expect: assert.False,
},
{
name: "non-matching oDataErr",
err: graphTD.ODataErr("fnords"),
expect: assert.False,
},
{
name: "non-matching oDataErrMsg",
err: graphTD.ODataErrWithMsg("fnords", "not supported"),
expect: assert.False,
},
{
name: "invalidRequest oDataErr, missing 400 status code",
err: graphTD.ODataErr(string(invalidRequest)),
expect: assert.False,
},
{
name: "invalidRequest oDataErr, with 400 status code",
err: graphTD.ODataErrWithAPIResponse(string(invalidRequest), http.StatusBadRequest),
expect: assert.True,
},
}
for _, test := range table {
suite.Run(test.name, func() {
test.expect(suite.T(), IsErrInvalidRequest(test.err))
})
}
}
func (suite *GraphErrorsUnitSuite) TestIsErrQuotaExceeded() { func (suite *GraphErrorsUnitSuite) TestIsErrQuotaExceeded() {
table := []struct { table := []struct {
name string name string

View File

@ -345,6 +345,11 @@ func kiotaMiddlewares(
var _ abstractions.RequestAdapter = &adapterWrap{} var _ abstractions.RequestAdapter = &adapterWrap{}
const (
// Delay between retry attempts
adapterRetryDelay = 3 * time.Second
)
// adapterWrap takes a GraphRequestAdapter and replaces the Send() function to // adapterWrap takes a GraphRequestAdapter and replaces the Send() function to
// act as a middleware for all http calls. Certain error conditions never reach // act as a middleware for all http calls. Certain error conditions never reach
// the the client middleware layer, and therefore miss out on logging and retries. // the the client middleware layer, and therefore miss out on logging and retries.
@ -354,11 +359,16 @@ var _ abstractions.RequestAdapter = &adapterWrap{}
// 3. Error and debug conditions are logged. // 3. Error and debug conditions are logged.
type adapterWrap struct { type adapterWrap struct {
abstractions.RequestAdapter abstractions.RequestAdapter
config *clientConfig config *clientConfig
retryDelay time.Duration
} }
func wrapAdapter(gra *msgraphsdkgo.GraphRequestAdapter, cc *clientConfig) *adapterWrap { func wrapAdapter(gra *msgraphsdkgo.GraphRequestAdapter, cc *clientConfig) *adapterWrap {
return &adapterWrap{gra, cc} return &adapterWrap{
RequestAdapter: gra,
config: cc,
retryDelay: adapterRetryDelay,
}
} }
var connectionEnded = filters.Contains([]string{ var connectionEnded = filters.Contains([]string{
@ -407,11 +417,18 @@ func (aw *adapterWrap) Send(
case IsErrBadJWTToken(err): case IsErrBadJWTToken(err):
logger.Ctx(ictx).Debug("bad jwt token") logger.Ctx(ictx).Debug("bad jwt token")
events.Inc(events.APICall, "badjwttoken") events.Inc(events.APICall, "badjwttoken")
case requestInfo.Method.String() == http.MethodGet && IsErrInvalidRequest(err):
// Graph may sometimes return a transient 400 response during onedrive
// and sharepoint backup. This is pending investigation on msft end, retry
// for now as it's a transient issue. Restrict retries to GET requests only
// to limit the scope of this fix.
logger.Ctx(ictx).Debug("invalid request")
events.Inc(events.APICall, "invalidrequest")
default: default:
return nil, clues.StackWC(ictx, err).WithTrace(1) return nil, clues.StackWC(ictx, err).WithTrace(1)
} }
time.Sleep(3 * time.Second) time.Sleep(aw.retryDelay)
} }
return sp, clues.Stack(err).OrNil() return sp, clues.Stack(err).OrNil()

View File

@ -1,6 +1,10 @@
package graph package graph
import ( import (
"bytes"
"context"
"encoding/json"
"io"
"net/http" "net/http"
"strconv" "strconv"
"syscall" "syscall"
@ -67,6 +71,9 @@ func (suite *GraphIntgSuite) TestCreateAdapter() {
assert.NoError(t, err, clues.ToCore(err)) assert.NoError(t, err, clues.ToCore(err))
assert.NotNil(t, adpt) assert.NotNil(t, adpt)
aw := adpt.(*adapterWrap)
assert.Equal(t, adapterRetryDelay, aw.retryDelay, "default retry delay")
} }
func (suite *GraphIntgSuite) TestHTTPClient() { func (suite *GraphIntgSuite) TestHTTPClient() {
@ -198,6 +205,10 @@ func (suite *GraphIntgSuite) TestAdapterWrap_catchesPanic() {
appendMiddleware(&alwaysPanicMiddleware)) appendMiddleware(&alwaysPanicMiddleware))
require.NoError(t, err, clues.ToCore(err)) require.NoError(t, err, clues.ToCore(err))
// Set retry delay to a low value to reduce test runtime.
aw := adpt.(*adapterWrap)
aw.retryDelay = 10 * time.Millisecond
// the query doesn't matter // the query doesn't matter
_, err = users.NewItemCalendarsItemEventsDeltaRequestBuilder(url, adpt).Get(ctx, nil) _, err = users.NewItemCalendarsItemEventsDeltaRequestBuilder(url, adpt).Get(ctx, nil)
require.Error(t, err, clues.ToCore(err)) require.Error(t, err, clues.ToCore(err))
@ -235,6 +246,10 @@ func (suite *GraphIntgSuite) TestAdapterWrap_retriesConnectionClose() {
appendMiddleware(&alwaysECONNRESET)) appendMiddleware(&alwaysECONNRESET))
require.NoError(t, err, clues.ToCore(err)) require.NoError(t, err, clues.ToCore(err))
// Set retry delay to a low value to reduce test runtime.
aw := adpt.(*adapterWrap)
aw.retryDelay = 10 * time.Millisecond
// the query doesn't matter // the query doesn't matter
_, err = users.NewItemCalendarsItemEventsDeltaRequestBuilder(url, adpt).Get(ctx, nil) _, err = users.NewItemCalendarsItemEventsDeltaRequestBuilder(url, adpt).Get(ctx, nil)
require.ErrorIs(t, err, syscall.ECONNRESET, clues.ToCore(err)) require.ErrorIs(t, err, syscall.ECONNRESET, clues.ToCore(err))
@ -292,6 +307,10 @@ func (suite *GraphIntgSuite) TestAdapterWrap_retriesBadJWTToken() {
appendMiddleware(&alwaysBadJWT)) appendMiddleware(&alwaysBadJWT))
require.NoError(t, err, clues.ToCore(err)) require.NoError(t, err, clues.ToCore(err))
// Set retry delay to a low value to reduce test runtime.
aw := adpt.(*adapterWrap)
aw.retryDelay = 10 * time.Millisecond
// When run locally this may fail. Not sure why it works in github but not locally. // When run locally this may fail. Not sure why it works in github but not locally.
// Pester keepers if it bothers you. // Pester keepers if it bothers you.
_, err = users. _, err = users.
@ -307,3 +326,106 @@ func (suite *GraphIntgSuite) TestAdapterWrap_retriesBadJWTToken() {
assert.True(t, IsErrBadJWTToken(err), clues.ToCore(err)) assert.True(t, IsErrBadJWTToken(err), clues.ToCore(err))
assert.Equal(t, 4, retryInc, "number of retries") assert.Equal(t, 4, retryInc, "number of retries")
} }
// TestAdapterWrap_retriesInvalidRequest tests adapter retries for graph 400
// invalidRequest errors. It also tests that retries are only done for GET
// requests.
func (suite *GraphIntgSuite) TestAdapterWrap_retriesInvalidRequest() {
var (
t = suite.T()
retryInc = 0
// Formulate a graph error response which is parseable to odata error.
graphResp = map[string]any{
"error": map[string]any{
"code": invalidRequest,
"message": "Invalid request",
"innerError": map[string]any{
"date": "2024-01-01T18:00:00",
"request-id": "rid",
"client-request-id": "cid",
},
},
}
)
serialized, err := json.Marshal(graphResp)
require.NoError(t, err, clues.ToCore(err))
l := int64(len(serialized))
// Set up a test middleware to always return a graph 400 invalidRequest
// response.
returnsGraphResp := mwForceResp{
alternate: func(req *http.Request) (bool, *http.Response, error) {
retryInc++
header := http.Header{}
header.Set("Content-Length", strconv.Itoa(int(l)))
header.Set("Content-Type", "application/json")
resp := &http.Response{
Body: io.NopCloser(bytes.NewReader(serialized)),
ContentLength: l,
Header: header,
Proto: req.Proto,
Request: req,
StatusCode: http.StatusBadRequest, // Required retry condition
}
return true, resp, nil
},
}
adpt, err := CreateAdapter(
suite.credentials.AzureTenantID,
suite.credentials.AzureClientID,
suite.credentials.AzureClientSecret,
count.New(),
appendMiddleware(&returnsGraphResp))
require.NoError(t, err, clues.ToCore(err))
// Set retry delay to a low value to reduce test runtime.
aw := adpt.(*adapterWrap)
aw.retryDelay = 10 * time.Millisecond
table := []struct {
name string
apiRequest func(t *testing.T, ctx context.Context) error
expectedRetries int
}{
{
name: "GET request, retried",
apiRequest: func(t *testing.T, ctx context.Context) error {
_, err = NewService(adpt).Client().Users().Get(ctx, nil)
return err
},
expectedRetries: 4,
},
{
name: "POST request, no retry",
apiRequest: func(t *testing.T, ctx context.Context) error {
u := models.NewUser()
cfg := users.UsersRequestBuilderPostRequestConfiguration{}
_, err = NewService(adpt).Client().Users().Post(ctx, u, &cfg)
return err
},
expectedRetries: 1,
},
}
for _, test := range table {
suite.Run(test.name, func() {
tt := suite.T()
retryInc = 0
ctx, flush := tester.NewContext(tt)
defer flush()
err := test.apiRequest(tt, ctx)
assert.True(tt, IsErrInvalidRequest(err), clues.ToCore(err))
assert.Equal(tt, test.expectedRetries, retryInc, "number of retries")
})
}
}

View File

@ -57,6 +57,18 @@ func ODataInner(innerCode string) *odataerrors.ODataError {
return odErr return odErr
} }
func ODataErrWithAPIResponse(
code string,
respCode int,
) *odataerrors.ODataError {
odErr := ODataErr(code)
// TODO(pandeyabs): Expand this function to set response headers as well.
odErr.SetStatusCode(respCode)
return odErr
}
func ParseableToMap(t *testing.T, thing serialization.Parsable) map[string]any { func ParseableToMap(t *testing.T, thing serialization.Parsable) map[string]any {
sw := kjson.NewJsonSerializationWriter() sw := kjson.NewJsonSerializationWriter()