retry on http/2 stream errors (#3802)

http/2 stream error failures don't allow
us to leverage client middleware.  As a
result we're lacking a retry handler.  This
change adds retries for http_wrapper
in the explicit case of a http/2 stream
error.

---

#### Does this PR need a docs update or release note?

- [x]  Yes, it's included

#### Type of change

- [x] 🐛 Bugfix

#### Test Plan

- [x] 💪 Manual
- [x] 💚 E2E
This commit is contained in:
Keepers 2023-07-14 16:57:51 -06:00 committed by GitHub
parent ab1cb1ebbc
commit fd6dff3270
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 49 additions and 17 deletions

View File

@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed ### Fixed
- Return a ServiceNotEnabled error when a tenant has no active SharePoint license. - Return a ServiceNotEnabled error when a tenant has no active SharePoint license.
- Added retries for http/2 stream connection failures when downloading large item content.
### Known issues ### Known issues
- If a link share is created for an item with inheritance disabled - If a link share is created for an item with inheritance disabled

View File

@ -194,13 +194,13 @@ func getDriveItemContent(
resp, err := gr.Request(ctx, http.MethodGet, *url, nil, nil) resp, err := gr.Request(ctx, http.MethodGet, *url, nil, nil)
if err != nil { if err != nil {
return nil, clues.New("downloading item").With("error", err) return nil, clues.New("requesting item content").With("error", err)
} }
defer resp.Body.Close() defer resp.Body.Close()
content, err := io.ReadAll(resp.Body) content, err := io.ReadAll(resp.Body)
if err != nil { if err != nil {
return nil, clues.New("read downloaded item").With("error", err) return nil, clues.New("reading item content").With("error", err)
} }
return content, nil return content, nil

View File

@ -4,11 +4,15 @@ import (
"context" "context"
"io" "io"
"net/http" "net/http"
"regexp"
"time"
"github.com/alcionai/clues" "github.com/alcionai/clues"
khttp "github.com/microsoft/kiota-http-go" khttp "github.com/microsoft/kiota-http-go"
"github.com/alcionai/corso/src/internal/events"
"github.com/alcionai/corso/src/internal/version" "github.com/alcionai/corso/src/internal/version"
"github.com/alcionai/corso/src/pkg/logger"
) )
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
@ -70,6 +74,8 @@ func NewNoTimeoutHTTPWrapper(opts ...Option) *httpWrapper {
// requests // requests
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
var streamErrRE = regexp.MustCompile(`stream error: stream ID \d+; .+; received from peer`)
// Request does the provided request. // Request does the provided request.
func (hw httpWrapper) Request( func (hw httpWrapper) Request(
ctx context.Context, ctx context.Context,
@ -91,7 +97,35 @@ func (hw httpWrapper) Request(
// See https://learn.microsoft.com/en-us/sharepoint/dev/general-development/how-to-avoid-getting-throttled-or-blocked-in-sharepoint-online#how-to-decorate-your-http-traffic // See https://learn.microsoft.com/en-us/sharepoint/dev/general-development/how-to-avoid-getting-throttled-or-blocked-in-sharepoint-online#how-to-decorate-your-http-traffic
req.Header.Set("User-Agent", "ISV|Alcion|Corso/"+version.Version) req.Header.Set("User-Agent", "ISV|Alcion|Corso/"+version.Version)
resp, err := hw.client.Do(req) var resp *http.Response
i := 0
// stream errors from http/2 will fail before we reach
// client middleware handling, therefore we don't get to
// make use of the retry middleware. This external
// retry wrapper is unsophisticated, but should only
// retry in the event of a `stream error`, which is not
// a common expectation.
for i < 3 {
ictx := clues.Add(ctx, "request_retry_iter", i)
resp, err = hw.client.Do(req)
if err != nil && !streamErrRE.MatchString(err.Error()) {
return nil, Stack(ictx, err)
}
if err == nil {
break
}
logger.Ctx(ictx).Debug("retrying after stream error")
events.Inc(events.APICall, "streamerror")
time.Sleep(3 * time.Second)
i++
}
if err != nil { if err != nil {
return nil, Stack(ctx, err) return nil, Stack(ctx, err)
} }

View File

@ -271,21 +271,20 @@ func (oc *Collection) getDriveItemContent(
var ( var (
itemID = ptr.Val(item.GetId()) itemID = ptr.Val(item.GetId())
itemName = ptr.Val(item.GetName()) itemName = ptr.Val(item.GetName())
el = errs.Local()
) )
itemData, err := downloadContent(ctx, oc.handler, oc.urlCache, item, oc.driveID) itemData, err := downloadContent(ctx, oc.handler, oc.urlCache, item, oc.driveID)
if err != nil { if err != nil {
if clues.HasLabel(err, graph.LabelsMalware) || (item != nil && item.GetMalware() != nil) { if clues.HasLabel(err, graph.LabelsMalware) || (item != nil && item.GetMalware() != nil) {
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipMalware).Info("item flagged as malware") logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipMalware).Info("item flagged as malware")
el.AddSkip(ctx, fault.FileSkip(fault.SkipMalware, driveID, itemID, itemName, graph.ItemInfo(item))) errs.AddSkip(ctx, fault.FileSkip(fault.SkipMalware, driveID, itemID, itemName, graph.ItemInfo(item)))
return nil, clues.Wrap(err, "malware item").Label(graph.LabelsSkippable) return nil, clues.Wrap(err, "malware item").Label(graph.LabelsSkippable)
} }
if clues.HasLabel(err, graph.LabelStatus(http.StatusNotFound)) || graph.IsErrDeletedInFlight(err) { if clues.HasLabel(err, graph.LabelStatus(http.StatusNotFound)) || graph.IsErrDeletedInFlight(err) {
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipNotFound).Info("item not found") logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipNotFound).Info("item not found")
el.AddSkip(ctx, fault.FileSkip(fault.SkipNotFound, driveID, itemID, itemName, graph.ItemInfo(item))) errs.AddSkip(ctx, fault.FileSkip(fault.SkipNotFound, driveID, itemID, itemName, graph.ItemInfo(item)))
return nil, clues.Wrap(err, "deleted item").Label(graph.LabelsSkippable) return nil, clues.Wrap(err, "deleted item").Label(graph.LabelsSkippable)
} }
@ -300,13 +299,13 @@ func (oc *Collection) getDriveItemContent(
// restore, or we have to handle it separately by somehow // restore, or we have to handle it separately by somehow
// deleting the entire collection. // deleting the entire collection.
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipBigOneNote).Info("max OneNote file size exceeded") logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipBigOneNote).Info("max OneNote file size exceeded")
el.AddSkip(ctx, fault.FileSkip(fault.SkipBigOneNote, driveID, itemID, itemName, graph.ItemInfo(item))) errs.AddSkip(ctx, fault.FileSkip(fault.SkipBigOneNote, driveID, itemID, itemName, graph.ItemInfo(item)))
return nil, clues.Wrap(err, "max oneNote item").Label(graph.LabelsSkippable) return nil, clues.Wrap(err, "max oneNote item").Label(graph.LabelsSkippable)
} }
logger.CtxErr(ctx, err).Error("downloading item") logger.CtxErr(ctx, err).Error("downloading item content")
el.AddRecoverable(ctx, clues.Stack(err).WithClues(ctx).Label(fault.LabelForceNoBackupCreation)) errs.AddRecoverable(ctx, clues.Stack(err).WithClues(ctx).Label(fault.LabelForceNoBackupCreation))
// return err, not el.Err(), because the lazy reader needs to communicate to // return err, not el.Err(), because the lazy reader needs to communicate to
// the data consumer that this item is unreadable, regardless of the fault state. // the data consumer that this item is unreadable, regardless of the fault state.
@ -416,7 +415,6 @@ type driveStats struct {
// and uses the collection `itemReader` to read the item // and uses the collection `itemReader` to read the item
func (oc *Collection) populateItems(ctx context.Context, errs *fault.Bus) { func (oc *Collection) populateItems(ctx context.Context, errs *fault.Bus) {
var ( var (
el = errs.Local()
stats driveStats stats driveStats
wg sync.WaitGroup wg sync.WaitGroup
) )
@ -442,7 +440,7 @@ func (oc *Collection) populateItems(ctx context.Context, errs *fault.Bus) {
defer close(semaphoreCh) defer close(semaphoreCh)
for _, item := range oc.driveItems { for _, item := range oc.driveItems {
if el.Failure() != nil { if errs.Failure() != nil {
break break
} }
@ -481,7 +479,6 @@ func (oc *Collection) populateDriveItem(
errs *fault.Bus, errs *fault.Bus,
) { ) {
var ( var (
el = errs.Local()
itemID = ptr.Val(item.GetId()) itemID = ptr.Val(item.GetId())
itemName = ptr.Val(item.GetName()) itemName = ptr.Val(item.GetName())
itemSize = ptr.Val(item.GetSize()) itemSize = ptr.Val(item.GetSize())
@ -518,7 +515,7 @@ func (oc *Collection) populateDriveItem(
// Fetch metadata for the file // Fetch metadata for the file
itemMeta, itemMetaSize, err = downloadItemMeta(ctx, oc.handler, oc.driveID, item) itemMeta, itemMetaSize, err = downloadItemMeta(ctx, oc.handler, oc.driveID, item)
if err != nil { if err != nil {
el.AddRecoverable(ctx, clues.Wrap(err, "getting item metadata").Label(fault.LabelForceNoBackupCreation)) errs.AddRecoverable(ctx, clues.Wrap(err, "getting item metadata").Label(fault.LabelForceNoBackupCreation))
return return
} }
@ -546,7 +543,7 @@ func (oc *Collection) populateDriveItem(
itemExtensionFactory) itemExtensionFactory)
if err != nil { if err != nil {
err := clues.Wrap(err, "adding extensions").Label(fault.LabelForceNoBackupCreation) err := clues.Wrap(err, "adding extensions").Label(fault.LabelForceNoBackupCreation)
el.AddRecoverable(ctx, err) errs.AddRecoverable(ctx, err)
return nil, err return nil, err
} }

View File

@ -137,9 +137,9 @@ func (e *Bus) logAndAddRecoverable(ctx context.Context, err error, skip int) {
isFail := e.addRecoverableErr(err) isFail := e.addRecoverableErr(err)
if isFail { if isFail {
log.Error("recoverable error") log.Errorf("recoverable error: %v", err)
} else { } else {
log.Info("recoverable error") log.Infof("recoverable error: %v", err)
} }
} }

View File

@ -34,7 +34,7 @@ var serviceToPathType = map[service]path.ServiceType{
var ( var (
ErrorBadSelectorCast = clues.New("wrong selector service type") ErrorBadSelectorCast = clues.New("wrong selector service type")
ErrorNoMatchingItems = clues.New("no items match the specified selectors") ErrorNoMatchingItems = clues.New("no items match the provided selectors")
ErrorUnrecognizedService = clues.New("unrecognized service") ErrorUnrecognizedService = clues.New("unrecognized service")
) )