From 28aba60cc5c326b674fd36f78609a13c54892a08 Mon Sep 17 00:00:00 2001 From: ashmrtn <3891298+ashmrtn@users.noreply.github.com> Date: Mon, 12 Feb 2024 18:26:04 -0800 Subject: [PATCH] Check for and retry 404s with no content (#5217) We've started seeing 404 errors with no content being returned. Check for these in the http wrapper we use and retry them. While graph SDK returns an error for this sort of situation it's a very basic error since it normally expects to parse info out of the response body. Therefore it should be safe to inject our own error that we can check for. --- #### Does this PR need a docs update or release note? - [ ] :white_check_mark: Yes, it's included - [ ] :clock1: Yes, but in a later PR - [x] :no_entry: No #### Type of change - [ ] :sunflower: Feature - [x] :bug: Bugfix - [ ] :world_map: Documentation - [ ] :robot: Supportability/Tests - [ ] :computer: CI/Deployment - [ ] :broom: Tech Debt/Cleanup #### Test Plan - [ ] :muscle: Manual - [x] :zap: Unit test - [ ] :green_heart: E2E --- src/pkg/services/m365/api/graph/errors.go | 20 +++- src/pkg/services/m365/api/graph/service.go | 7 ++ .../services/m365/api/graph/service_test.go | 110 ++++++++++++++++++ 3 files changed, 133 insertions(+), 4 deletions(-) diff --git a/src/pkg/services/m365/api/graph/errors.go b/src/pkg/services/m365/api/graph/errors.go index 90ba550de..0b7793adb 100644 --- a/src/pkg/services/m365/api/graph/errors.go +++ b/src/pkg/services/m365/api/graph/errors.go @@ -114,7 +114,14 @@ const ( // ErrServiceUnavailableEmptyResp indicates the remote service returned a 503 // with an empty response body. This can sometimes happen if a request times out // during processing. -var ErrServiceUnavailableEmptyResp = clues.New("service unavailable and no returned content") +// +// TODO(ashmrtn): Either make a separate error struct for empty responses and +// implement Is() on it or start using tags on errors for the different status +// codes. +var ( + ErrServiceUnavailableEmptyResp = clues.New("service unavailable and no returned content") + ErrNotFoundEmptyResp = clues.New("not found and no returned content") +) // --------------------------------------------------------------------------- // error categorization @@ -410,9 +417,14 @@ func stackReq( // then all we get from graph SDK is an error saying "content is empty" which // isn't particularly useful. if resp != nil && - resp.ContentLength == 0 && - resp.StatusCode == http.StatusServiceUnavailable { - e = clues.Stack(ErrServiceUnavailableEmptyResp, e) + resp.ContentLength == 0 { + switch resp.StatusCode { + case http.StatusServiceUnavailable: + e = clues.Stack(ErrServiceUnavailableEmptyResp, e) + + case http.StatusNotFound: + e = clues.Stack(ErrNotFoundEmptyResp, e) + } } if e == nil { diff --git a/src/pkg/services/m365/api/graph/service.go b/src/pkg/services/m365/api/graph/service.go index 56ef60eac..748e82f1a 100644 --- a/src/pkg/services/m365/api/graph/service.go +++ b/src/pkg/services/m365/api/graph/service.go @@ -442,6 +442,13 @@ func (aw *adapterWrap) Send( // to limit the scope of this fix. logger.Ctx(ictx).Debug("invalid request") events.Inc(events.APICall, "invalidgetrequest") + } else if requestInfo.Method.String() == http.MethodGet && errors.Is(err, ErrNotFoundEmptyResp) { + // We've started seeing 404s with no content being returned for messages + // message attachments, and events. Attempting to manually fetch the items + // succeeds. Therefore we want to retry these to see if we can work around + // the problem. + logger.Ctx(ictx).Debug("404 with no content") + events.Inc(events.APICall, "notfoundnocontent") } else { // exit most errors without retry break diff --git a/src/pkg/services/m365/api/graph/service_test.go b/src/pkg/services/m365/api/graph/service_test.go index 9cb853da4..2ed7a4d38 100644 --- a/src/pkg/services/m365/api/graph/service_test.go +++ b/src/pkg/services/m365/api/graph/service_test.go @@ -12,6 +12,7 @@ import ( "time" "github.com/alcionai/clues" + "github.com/h2non/gock" "github.com/microsoftgraph/msgraph-sdk-go/models" "github.com/microsoftgraph/msgraph-sdk-go/users" "github.com/stretchr/testify/assert" @@ -26,6 +27,115 @@ import ( graphTD "github.com/alcionai/corso/src/pkg/services/m365/api/graph/testdata" ) +// --------------------------------------------------------------------------- +// Unit tests +// --------------------------------------------------------------------------- + +type GraphUnitSuite struct { + tester.Suite +} + +func TestGraphUnitSuite(t *testing.T) { + suite.Run(t, &GraphUnitSuite{ + Suite: tester.NewUnitSuite(t), + }) +} + +func (suite *GraphUnitSuite) TestNoRetryPostNoContent404() { + const ( + host = "https://graph.microsoft.com" + retries = 3 + ) + + t := suite.T() + + ctx, flush := tester.NewContext(t) + t.Cleanup(flush) + + a := tconfig.NewFakeM365Account(t) + creds, err := a.M365Config() + require.NoError(t, err, clues.ToCore(err)) + + // Run with a single retry since 503 retries are exponential and + // the test will take a long time to run. + service, err := NewGockService( + creds, + count.New(), + MaxRetries(1), + MaxConnectionRetries(retries)) + require.NoError(t, err, clues.ToCore(err)) + + t.Cleanup(gock.Off) + + gock.New(host). + Post("/v1.0/users"). + Reply(http.StatusNotFound). + BodyString(""). + Type("text/plain") + + // Since we're retrying all 404s with no content the endpoint we use doesn't + // matter. + _, err = service.Client().Users().Post(ctx, models.NewUser(), nil) + assert.ErrorIs(t, err, ErrNotFoundEmptyResp) + + assert.False(t, gock.IsPending(), "some requests not seen") +} + +func (suite *GraphUnitSuite) TestRetryGetNoContent404() { + const ( + host = "https://graph.microsoft.com" + retries = 3 + + emptyUserList = `{ +"@odata.context": "https://graph.microsoft.com/v1.0/$metadata#users", +"value": [] +}` + ) + + t := suite.T() + + ctx, flush := tester.NewContext(t) + t.Cleanup(flush) + + a := tconfig.NewFakeM365Account(t) + creds, err := a.M365Config() + require.NoError(t, err, clues.ToCore(err)) + + // Run with a single retry since 503 retries are exponential and + // the test will take a long time to run. + service, err := NewGockService( + creds, + count.New(), + MaxRetries(1), + MaxConnectionRetries(retries)) + require.NoError(t, err, clues.ToCore(err)) + + t.Cleanup(gock.Off) + + gock.New(host). + Get("/v1.0/users"). + Times(retries - 1). + Reply(http.StatusNotFound). + BodyString(""). + Type("text/plain") + + gock.New(host). + Get("/v1.0/users"). + Reply(http.StatusOK). + JSON(emptyUserList) + + // Since we're retrying all 404s with no content the endpoint we use doesn't + // matter. + _, err = service.Client().Users().Get(ctx, nil) + assert.NoError(t, err, clues.ToCore(err)) + + assert.False(t, gock.IsPending(), "some requests not seen") +} + +// --------------------------------------------------------------------------- +// Integration tests +// --------------------------------------------------------------------------- + type GraphIntgSuite struct { tester.Suite fakeCredentials account.M365Config