Add retries for transient graph 400 "invalidRequest" errors (#4993)

<!-- PR description-->

* Adding a retry for 400 "invalidRequest" errors. Restricting it to GET requests to limit the scope of the fix.
* Retrying at adapter wrapper `Send` level instead of doing it in retry middleware. If we do it in middleware, we'll have to resort to a string search of "invalidRequest" in the http response, which is not the best way. kiota-http nicely parses the http response into an odata error after the request has finished. So doing retry after that in adapter wrapper `Send` is better as we have better guarantees on it being an odata error type. 
* Also added a small test optimization to reduce test run time.

---

#### Does this PR need a docs update or release note?

- [x]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [ ]  No

#### Type of change

<!--- Please check the type of change your PR introduces: --->
- [ ] 🌻 Feature
- [x] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

<!-- Can reference multiple issues. Use one of the following "magic words" - "closes, fixes" to auto-close the Github issue. -->
* #<issue>

#### Test Plan

<!-- How will this be tested prior to merging.-->
- [x] 💪 Manual
- [x]  Unit test
- [ ] 💚 E2E
This commit is contained in:
Abhishek Pandey 2024-01-09 19:17:50 -08:00 committed by GitHub
parent a40807aa18
commit dbac8a9fbf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 214 additions and 4 deletions

View File

@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] (beta)
### Fixed
- Retry transient 400 "invalidRequest" errors during onedrive & sharepoint backup.
### Changed
- When running `backup details` on an empty backup returns a more helpful error message.

View File

@ -53,6 +53,7 @@ const (
// the same name as another folder in the same parent. Such duplicate folder
// names are not allowed by graph.
folderExists errorCode = "ErrorFolderExists"
invalidRequest errorCode = "invalidRequest"
// Some datacenters are returning this when we try to get the inbox of a user
// that doesn't exist.
invalidUser errorCode = "ErrorInvalidUser"
@ -169,6 +170,11 @@ func IsErrInvalidDelta(err error) bool {
return parseODataErr(err).hasErrorCode(err, syncStateNotFound, resyncRequired, syncStateInvalid)
}
func IsErrInvalidRequest(err error) bool {
return parseODataErr(err).hasResponseCode(err, http.StatusBadRequest) &&
parseODataErr(err).hasErrorCode(err, invalidRequest)
}
func IsErrDeltaNotSupported(err error) bool {
return parseODataErr(err).hasErrorMessage(err, ParameterDeltaTokenNotSupported)
}
@ -557,6 +563,12 @@ func parseODataErr(err error) oDataErr {
return ode
}
// hasResponseCode checks if the error is an ODataError and carries the provided
// http response code.
func (ode oDataErr) hasResponseCode(err error, httpResponseCode int) bool {
return ode.isODataErr && ode.Resp.StatusCode == httpResponseCode
}
func (ode oDataErr) hasErrorCode(err error, codes ...errorCode) bool {
if !ode.isODataErr {
return false

View File

@ -335,6 +335,50 @@ func (suite *GraphErrorsUnitSuite) TestIsErrDeltaNotSupported() {
}
}
func (suite *GraphErrorsUnitSuite) TestIsErrInvalidRequest() {
table := []struct {
name string
err error
expect assert.BoolAssertionFunc
}{
{
name: "nil",
err: nil,
expect: assert.False,
},
{
name: "non-matching",
err: assert.AnError,
expect: assert.False,
},
{
name: "non-matching oDataErr",
err: graphTD.ODataErr("fnords"),
expect: assert.False,
},
{
name: "non-matching oDataErrMsg",
err: graphTD.ODataErrWithMsg("fnords", "not supported"),
expect: assert.False,
},
{
name: "invalidRequest oDataErr, missing 400 status code",
err: graphTD.ODataErr(string(invalidRequest)),
expect: assert.False,
},
{
name: "invalidRequest oDataErr, with 400 status code",
err: graphTD.ODataErrWithAPIResponse(string(invalidRequest), http.StatusBadRequest),
expect: assert.True,
},
}
for _, test := range table {
suite.Run(test.name, func() {
test.expect(suite.T(), IsErrInvalidRequest(test.err))
})
}
}
func (suite *GraphErrorsUnitSuite) TestIsErrQuotaExceeded() {
table := []struct {
name string

View File

@ -345,6 +345,11 @@ func kiotaMiddlewares(
var _ abstractions.RequestAdapter = &adapterWrap{}
const (
// Delay between retry attempts
adapterRetryDelay = 3 * time.Second
)
// adapterWrap takes a GraphRequestAdapter and replaces the Send() function to
// act as a middleware for all http calls. Certain error conditions never reach
// the the client middleware layer, and therefore miss out on logging and retries.
@ -355,10 +360,15 @@ var _ abstractions.RequestAdapter = &adapterWrap{}
type adapterWrap struct {
abstractions.RequestAdapter
config *clientConfig
retryDelay time.Duration
}
func wrapAdapter(gra *msgraphsdkgo.GraphRequestAdapter, cc *clientConfig) *adapterWrap {
return &adapterWrap{gra, cc}
return &adapterWrap{
RequestAdapter: gra,
config: cc,
retryDelay: adapterRetryDelay,
}
}
var connectionEnded = filters.Contains([]string{
@ -407,11 +417,18 @@ func (aw *adapterWrap) Send(
case IsErrBadJWTToken(err):
logger.Ctx(ictx).Debug("bad jwt token")
events.Inc(events.APICall, "badjwttoken")
case requestInfo.Method.String() == http.MethodGet && IsErrInvalidRequest(err):
// Graph may sometimes return a transient 400 response during onedrive
// and sharepoint backup. This is pending investigation on msft end, retry
// for now as it's a transient issue. Restrict retries to GET requests only
// to limit the scope of this fix.
logger.Ctx(ictx).Debug("invalid request")
events.Inc(events.APICall, "invalidrequest")
default:
return nil, clues.StackWC(ictx, err).WithTrace(1)
}
time.Sleep(3 * time.Second)
time.Sleep(aw.retryDelay)
}
return sp, clues.Stack(err).OrNil()

View File

@ -1,6 +1,10 @@
package graph
import (
"bytes"
"context"
"encoding/json"
"io"
"net/http"
"strconv"
"syscall"
@ -67,6 +71,9 @@ func (suite *GraphIntgSuite) TestCreateAdapter() {
assert.NoError(t, err, clues.ToCore(err))
assert.NotNil(t, adpt)
aw := adpt.(*adapterWrap)
assert.Equal(t, adapterRetryDelay, aw.retryDelay, "default retry delay")
}
func (suite *GraphIntgSuite) TestHTTPClient() {
@ -198,6 +205,10 @@ func (suite *GraphIntgSuite) TestAdapterWrap_catchesPanic() {
appendMiddleware(&alwaysPanicMiddleware))
require.NoError(t, err, clues.ToCore(err))
// Set retry delay to a low value to reduce test runtime.
aw := adpt.(*adapterWrap)
aw.retryDelay = 10 * time.Millisecond
// the query doesn't matter
_, err = users.NewItemCalendarsItemEventsDeltaRequestBuilder(url, adpt).Get(ctx, nil)
require.Error(t, err, clues.ToCore(err))
@ -235,6 +246,10 @@ func (suite *GraphIntgSuite) TestAdapterWrap_retriesConnectionClose() {
appendMiddleware(&alwaysECONNRESET))
require.NoError(t, err, clues.ToCore(err))
// Set retry delay to a low value to reduce test runtime.
aw := adpt.(*adapterWrap)
aw.retryDelay = 10 * time.Millisecond
// the query doesn't matter
_, err = users.NewItemCalendarsItemEventsDeltaRequestBuilder(url, adpt).Get(ctx, nil)
require.ErrorIs(t, err, syscall.ECONNRESET, clues.ToCore(err))
@ -292,6 +307,10 @@ func (suite *GraphIntgSuite) TestAdapterWrap_retriesBadJWTToken() {
appendMiddleware(&alwaysBadJWT))
require.NoError(t, err, clues.ToCore(err))
// Set retry delay to a low value to reduce test runtime.
aw := adpt.(*adapterWrap)
aw.retryDelay = 10 * time.Millisecond
// When run locally this may fail. Not sure why it works in github but not locally.
// Pester keepers if it bothers you.
_, err = users.
@ -307,3 +326,106 @@ func (suite *GraphIntgSuite) TestAdapterWrap_retriesBadJWTToken() {
assert.True(t, IsErrBadJWTToken(err), clues.ToCore(err))
assert.Equal(t, 4, retryInc, "number of retries")
}
// TestAdapterWrap_retriesInvalidRequest tests adapter retries for graph 400
// invalidRequest errors. It also tests that retries are only done for GET
// requests.
func (suite *GraphIntgSuite) TestAdapterWrap_retriesInvalidRequest() {
var (
t = suite.T()
retryInc = 0
// Formulate a graph error response which is parseable to odata error.
graphResp = map[string]any{
"error": map[string]any{
"code": invalidRequest,
"message": "Invalid request",
"innerError": map[string]any{
"date": "2024-01-01T18:00:00",
"request-id": "rid",
"client-request-id": "cid",
},
},
}
)
serialized, err := json.Marshal(graphResp)
require.NoError(t, err, clues.ToCore(err))
l := int64(len(serialized))
// Set up a test middleware to always return a graph 400 invalidRequest
// response.
returnsGraphResp := mwForceResp{
alternate: func(req *http.Request) (bool, *http.Response, error) {
retryInc++
header := http.Header{}
header.Set("Content-Length", strconv.Itoa(int(l)))
header.Set("Content-Type", "application/json")
resp := &http.Response{
Body: io.NopCloser(bytes.NewReader(serialized)),
ContentLength: l,
Header: header,
Proto: req.Proto,
Request: req,
StatusCode: http.StatusBadRequest, // Required retry condition
}
return true, resp, nil
},
}
adpt, err := CreateAdapter(
suite.credentials.AzureTenantID,
suite.credentials.AzureClientID,
suite.credentials.AzureClientSecret,
count.New(),
appendMiddleware(&returnsGraphResp))
require.NoError(t, err, clues.ToCore(err))
// Set retry delay to a low value to reduce test runtime.
aw := adpt.(*adapterWrap)
aw.retryDelay = 10 * time.Millisecond
table := []struct {
name string
apiRequest func(t *testing.T, ctx context.Context) error
expectedRetries int
}{
{
name: "GET request, retried",
apiRequest: func(t *testing.T, ctx context.Context) error {
_, err = NewService(adpt).Client().Users().Get(ctx, nil)
return err
},
expectedRetries: 4,
},
{
name: "POST request, no retry",
apiRequest: func(t *testing.T, ctx context.Context) error {
u := models.NewUser()
cfg := users.UsersRequestBuilderPostRequestConfiguration{}
_, err = NewService(adpt).Client().Users().Post(ctx, u, &cfg)
return err
},
expectedRetries: 1,
},
}
for _, test := range table {
suite.Run(test.name, func() {
tt := suite.T()
retryInc = 0
ctx, flush := tester.NewContext(tt)
defer flush()
err := test.apiRequest(tt, ctx)
assert.True(tt, IsErrInvalidRequest(err), clues.ToCore(err))
assert.Equal(tt, test.expectedRetries, retryInc, "number of retries")
})
}
}

View File

@ -57,6 +57,18 @@ func ODataInner(innerCode string) *odataerrors.ODataError {
return odErr
}
func ODataErrWithAPIResponse(
code string,
respCode int,
) *odataerrors.ODataError {
odErr := ODataErr(code)
// TODO(pandeyabs): Expand this function to set response headers as well.
odErr.SetStatusCode(respCode)
return odErr
}
func ParseableToMap(t *testing.T, thing serialization.Parsable) map[string]any {
sw := kjson.NewJsonSerializationWriter()