retry on http/2 stream errors (#3802)

http/2 stream error failures don't allow
us to leverage client middleware.  As a
result we're lacking a retry handler.  This
change adds retries for http_wrapper
in the explicit case of a http/2 stream
error.

---

#### Does this PR need a docs update or release note?

- [x]  Yes, it's included

#### Type of change

- [x] 🐛 Bugfix

#### Test Plan

- [x] 💪 Manual
- [x] 💚 E2E
This commit is contained in:
Keepers 2023-07-14 16:57:51 -06:00 committed by GitHub
parent ab1cb1ebbc
commit fd6dff3270
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 49 additions and 17 deletions

View File

@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Return a ServiceNotEnabled error when a tenant has no active SharePoint license.
- Added retries for http/2 stream connection failures when downloading large item content.
### Known issues
- If a link share is created for an item with inheritance disabled

View File

@ -194,13 +194,13 @@ func getDriveItemContent(
resp, err := gr.Request(ctx, http.MethodGet, *url, nil, nil)
if err != nil {
return nil, clues.New("downloading item").With("error", err)
return nil, clues.New("requesting item content").With("error", err)
}
defer resp.Body.Close()
content, err := io.ReadAll(resp.Body)
if err != nil {
return nil, clues.New("read downloaded item").With("error", err)
return nil, clues.New("reading item content").With("error", err)
}
return content, nil

View File

@ -4,11 +4,15 @@ import (
"context"
"io"
"net/http"
"regexp"
"time"
"github.com/alcionai/clues"
khttp "github.com/microsoft/kiota-http-go"
"github.com/alcionai/corso/src/internal/events"
"github.com/alcionai/corso/src/internal/version"
"github.com/alcionai/corso/src/pkg/logger"
)
// ---------------------------------------------------------------------------
@ -70,6 +74,8 @@ func NewNoTimeoutHTTPWrapper(opts ...Option) *httpWrapper {
// requests
// ---------------------------------------------------------------------------
var streamErrRE = regexp.MustCompile(`stream error: stream ID \d+; .+; received from peer`)
// Request does the provided request.
func (hw httpWrapper) Request(
ctx context.Context,
@ -91,7 +97,35 @@ func (hw httpWrapper) Request(
// See https://learn.microsoft.com/en-us/sharepoint/dev/general-development/how-to-avoid-getting-throttled-or-blocked-in-sharepoint-online#how-to-decorate-your-http-traffic
req.Header.Set("User-Agent", "ISV|Alcion|Corso/"+version.Version)
resp, err := hw.client.Do(req)
var resp *http.Response
i := 0
// stream errors from http/2 will fail before we reach
// client middleware handling, therefore we don't get to
// make use of the retry middleware. This external
// retry wrapper is unsophisticated, but should only
// retry in the event of a `stream error`, which is not
// a common expectation.
for i < 3 {
ictx := clues.Add(ctx, "request_retry_iter", i)
resp, err = hw.client.Do(req)
if err != nil && !streamErrRE.MatchString(err.Error()) {
return nil, Stack(ictx, err)
}
if err == nil {
break
}
logger.Ctx(ictx).Debug("retrying after stream error")
events.Inc(events.APICall, "streamerror")
time.Sleep(3 * time.Second)
i++
}
if err != nil {
return nil, Stack(ctx, err)
}

View File

@ -271,21 +271,20 @@ func (oc *Collection) getDriveItemContent(
var (
itemID = ptr.Val(item.GetId())
itemName = ptr.Val(item.GetName())
el = errs.Local()
)
itemData, err := downloadContent(ctx, oc.handler, oc.urlCache, item, oc.driveID)
if err != nil {
if clues.HasLabel(err, graph.LabelsMalware) || (item != nil && item.GetMalware() != nil) {
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipMalware).Info("item flagged as malware")
el.AddSkip(ctx, fault.FileSkip(fault.SkipMalware, driveID, itemID, itemName, graph.ItemInfo(item)))
errs.AddSkip(ctx, fault.FileSkip(fault.SkipMalware, driveID, itemID, itemName, graph.ItemInfo(item)))
return nil, clues.Wrap(err, "malware item").Label(graph.LabelsSkippable)
}
if clues.HasLabel(err, graph.LabelStatus(http.StatusNotFound)) || graph.IsErrDeletedInFlight(err) {
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipNotFound).Info("item not found")
el.AddSkip(ctx, fault.FileSkip(fault.SkipNotFound, driveID, itemID, itemName, graph.ItemInfo(item)))
errs.AddSkip(ctx, fault.FileSkip(fault.SkipNotFound, driveID, itemID, itemName, graph.ItemInfo(item)))
return nil, clues.Wrap(err, "deleted item").Label(graph.LabelsSkippable)
}
@ -300,13 +299,13 @@ func (oc *Collection) getDriveItemContent(
// restore, or we have to handle it separately by somehow
// deleting the entire collection.
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipBigOneNote).Info("max OneNote file size exceeded")
el.AddSkip(ctx, fault.FileSkip(fault.SkipBigOneNote, driveID, itemID, itemName, graph.ItemInfo(item)))
errs.AddSkip(ctx, fault.FileSkip(fault.SkipBigOneNote, driveID, itemID, itemName, graph.ItemInfo(item)))
return nil, clues.Wrap(err, "max oneNote item").Label(graph.LabelsSkippable)
}
logger.CtxErr(ctx, err).Error("downloading item")
el.AddRecoverable(ctx, clues.Stack(err).WithClues(ctx).Label(fault.LabelForceNoBackupCreation))
logger.CtxErr(ctx, err).Error("downloading item content")
errs.AddRecoverable(ctx, clues.Stack(err).WithClues(ctx).Label(fault.LabelForceNoBackupCreation))
// return err, not el.Err(), because the lazy reader needs to communicate to
// the data consumer that this item is unreadable, regardless of the fault state.
@ -416,7 +415,6 @@ type driveStats struct {
// and uses the collection `itemReader` to read the item
func (oc *Collection) populateItems(ctx context.Context, errs *fault.Bus) {
var (
el = errs.Local()
stats driveStats
wg sync.WaitGroup
)
@ -442,7 +440,7 @@ func (oc *Collection) populateItems(ctx context.Context, errs *fault.Bus) {
defer close(semaphoreCh)
for _, item := range oc.driveItems {
if el.Failure() != nil {
if errs.Failure() != nil {
break
}
@ -481,7 +479,6 @@ func (oc *Collection) populateDriveItem(
errs *fault.Bus,
) {
var (
el = errs.Local()
itemID = ptr.Val(item.GetId())
itemName = ptr.Val(item.GetName())
itemSize = ptr.Val(item.GetSize())
@ -518,7 +515,7 @@ func (oc *Collection) populateDriveItem(
// Fetch metadata for the file
itemMeta, itemMetaSize, err = downloadItemMeta(ctx, oc.handler, oc.driveID, item)
if err != nil {
el.AddRecoverable(ctx, clues.Wrap(err, "getting item metadata").Label(fault.LabelForceNoBackupCreation))
errs.AddRecoverable(ctx, clues.Wrap(err, "getting item metadata").Label(fault.LabelForceNoBackupCreation))
return
}
@ -546,7 +543,7 @@ func (oc *Collection) populateDriveItem(
itemExtensionFactory)
if err != nil {
err := clues.Wrap(err, "adding extensions").Label(fault.LabelForceNoBackupCreation)
el.AddRecoverable(ctx, err)
errs.AddRecoverable(ctx, err)
return nil, err
}

View File

@ -137,9 +137,9 @@ func (e *Bus) logAndAddRecoverable(ctx context.Context, err error, skip int) {
isFail := e.addRecoverableErr(err)
if isFail {
log.Error("recoverable error")
log.Errorf("recoverable error: %v", err)
} else {
log.Info("recoverable error")
log.Infof("recoverable error: %v", err)
}
}

View File

@ -34,7 +34,7 @@ var serviceToPathType = map[service]path.ServiceType{
var (
ErrorBadSelectorCast = clues.New("wrong selector service type")
ErrorNoMatchingItems = clues.New("no items match the specified selectors")
ErrorNoMatchingItems = clues.New("no items match the provided selectors")
ErrorUnrecognizedService = clues.New("unrecognized service")
)