Skip items returning 404 (#2805)

Treat a 404 as an unavailable item and skip it similar to malware items.

---

#### Does this PR need a docs update or release note?

- [x]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [ ]  No

#### Type of change

<!--- Please check the type of change your PR introduces: --->
- [ ] 🌻 Feature
- [x] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Test
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

<!-- Can reference multiple issues. Use one of the following "magic words" - "closes, fixes" to auto-close the Github issue. -->
* #<issue>

#### Test Plan

<!-- How will this be tested prior to merging.-->
- [ ] 💪 Manual
- [x]  Unit test
- [ ] 💚 E2E
Treat a 404 as an unavailable item and skip it similar to malware items.
This commit is contained in:
Abin Simon 2023-03-15 23:22:22 +05:30 committed by GitHub
parent 766dbb2c94
commit 0125876192
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 114 additions and 33 deletions

View File

@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added ### Added
- Sharepoint library (document files) support: backup, list, details, and restore. - Sharepoint library (document files) support: backup, list, details, and restore.
- OneDrive item downloads that return 404 during backup (normally due to external deletion while Corso processes) are now skipped instead of quietly dropped. These items will appear in the skipped list alongside other skipped cases such as malware detection.
### Fixed ### Fixed
- Fix repo connect not working without a config file - Fix repo connect not working without a config file

View File

@ -46,6 +46,9 @@ const (
const ( const (
LabelsMalware = "malware_detected" LabelsMalware = "malware_detected"
LabelsMysiteNotFound = "mysite_not_found" LabelsMysiteNotFound = "mysite_not_found"
// LabelsSkippable is used to determine if an error is skippable
LabelsSkippable = "skippable_errors"
) )
var ( var (
@ -56,7 +59,7 @@ var (
// Delta tokens can be desycned or expired. In either case, the token // Delta tokens can be desycned or expired. In either case, the token
// becomes invalid, and cannot be used again. // becomes invalid, and cannot be used again.
// https://learn.microsoft.com/en-us/graph/errors#code-property // https://learn.microsoft.com/en-us/graph/errors#code-property
ErrInvalidDelta = clues.New("inalid delta token") ErrInvalidDelta = clues.New("invalid delta token")
// Timeout errors are identified for tracking the need to retry calls. // Timeout errors are identified for tracking the need to retry calls.
// Other delay errors, like throttling, are already handled by the // Other delay errors, like throttling, are already handled by the
@ -255,9 +258,9 @@ func appendIf(a []any, k string, v *string) []any {
return append(a, k, *v) return append(a, k, *v)
} }
// MalwareInfo gathers potentially useful information about a malware infected // ItemInfo gathers potentially useful information about a drive item,
// drive item, and aggregates that data into a map. // and aggregates that data into a map.
func MalwareInfo(item models.DriveItemable) map[string]any { func ItemInfo(item models.DriveItemable) map[string]any {
m := map[string]any{} m := map[string]any{}
creator := item.GetCreatedByUser() creator := item.GetCreatedByUser()

View File

@ -259,5 +259,5 @@ func (suite *GraphErrorsUnitSuite) TestMalwareInfo() {
fault.AddtlMalwareDesc: malDesc, fault.AddtlMalwareDesc: malDesc,
} }
assert.Equal(suite.T(), expect, MalwareInfo(&i)) assert.Equal(suite.T(), expect, ItemInfo(&i))
} }

View File

@ -318,7 +318,6 @@ func (oc *Collection) getDriveItemContent(
// Initial try with url from delta + 2 retries // Initial try with url from delta + 2 retries
for i := 1; i <= maxDownloadRetires; i++ { for i := 1; i <= maxDownloadRetires; i++ {
_, itemData, err = oc.itemReader(ctx, oc.itemClient, item) _, itemData, err = oc.itemReader(ctx, oc.itemClient, item)
if err == nil || !graph.IsErrUnauthorized(err) { if err == nil || !graph.IsErrUnauthorized(err) {
break break
} }
@ -339,16 +338,25 @@ func (oc *Collection) getDriveItemContent(
// check for errors following retries // check for errors following retries
if err != nil { if err != nil {
if clues.HasLabel(err, graph.LabelsMalware) || (item != nil && item.GetMalware() != nil) { if clues.HasLabel(err, graph.LabelsMalware) || (item != nil && item.GetMalware() != nil) {
logger.Ctx(ctx).With("error", err.Error(), "malware", true).Error("downloading item") logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipMalware).Info("item flagged as malware")
el.AddSkip(fault.FileSkip(fault.SkipMalware, itemID, itemName, graph.MalwareInfo(item))) el.AddSkip(fault.FileSkip(fault.SkipMalware, itemID, itemName, graph.ItemInfo(item)))
} else {
logger.Ctx(ctx).With("error", err.Error()).Error("downloading item") return nil, clues.Wrap(err, "downloading item").Label(graph.LabelsSkippable)
el.AddRecoverable(clues.Stack(err).WithClues(ctx).Label(fault.LabelForceNoBackupCreation))
} }
if clues.HasLabel(err, graph.LabelStatus(http.StatusNotFound)) || graph.IsErrDeletedInFlight(err) {
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipNotFound).Error("item not found")
el.AddSkip(fault.FileSkip(fault.SkipNotFound, itemID, itemName, graph.ItemInfo(item)))
return nil, clues.Wrap(err, "downloading item").Label(graph.LabelsSkippable)
}
logger.CtxErr(ctx, err).Error("downloading item")
el.AddRecoverable(clues.Stack(err).WithClues(ctx).Label(fault.LabelForceNoBackupCreation))
// return err, not el.Err(), because the lazy reader needs to communicate to // return err, not el.Err(), because the lazy reader needs to communicate to
// the data consumer that this item is unreadable, regardless of the fault state. // the data consumer that this item is unreadable, regardless of the fault state.
return nil, err return nil, clues.Wrap(err, "downloading item")
} }
return itemData, nil return itemData, nil

View File

@ -90,7 +90,7 @@ func (suite *CollectionUnitTestSuite) TestCollection() {
itemDeets nst itemDeets nst
infoFrom func(*testing.T, details.ItemInfo) (string, string) infoFrom func(*testing.T, details.ItemInfo) (string, string)
expectErr require.ErrorAssertionFunc expectErr require.ErrorAssertionFunc
expectLabel string expectLabels []string
}{ }{
{ {
name: "oneDrive, no duplicates", name: "oneDrive, no duplicates",
@ -137,7 +137,23 @@ func (suite *CollectionUnitTestSuite) TestCollection() {
return dii.OneDrive.ItemName, dii.OneDrive.ParentPath return dii.OneDrive.ItemName, dii.OneDrive.ParentPath
}, },
expectErr: require.Error, expectErr: require.Error,
expectLabel: graph.LabelsMalware, expectLabels: []string{graph.LabelsMalware, graph.LabelsSkippable},
},
{
name: "oneDrive, not found",
numInstances: 3,
source: OneDriveSource,
itemDeets: nst{testItemName, 42, now},
// Usually `Not Found` is returned from itemGetter and not itemReader
itemReader: func(context.Context, *http.Client, models.DriveItemable) (details.ItemInfo, io.ReadCloser, error) {
return details.ItemInfo{}, nil, clues.New("test not found").Label(graph.LabelStatus(http.StatusNotFound))
},
infoFrom: func(t *testing.T, dii details.ItemInfo) (string, string) {
require.NotNil(t, dii.OneDrive)
return dii.OneDrive.ItemName, dii.OneDrive.ParentPath
},
expectErr: require.Error,
expectLabels: []string{graph.LabelStatus(http.StatusNotFound), graph.LabelsSkippable},
}, },
{ {
name: "sharePoint, no duplicates", name: "sharePoint, no duplicates",
@ -267,8 +283,8 @@ func (suite *CollectionUnitTestSuite) TestCollection() {
test.expectErr(t, err) test.expectErr(t, err)
if err != nil { if err != nil {
if len(test.expectLabel) > 0 { for _, label := range test.expectLabels {
assert.True(t, clues.HasLabel(err, test.expectLabel), "has clues label:", test.expectLabel) assert.True(t, clues.HasLabel(err, label), "has clues label:", label)
} }
return return

View File

@ -635,7 +635,7 @@ func (c *Collections) UpdateCollections(
) )
if item.GetMalware() != nil { if item.GetMalware() != nil {
addtl := graph.MalwareInfo(item) addtl := graph.ItemInfo(item)
skip := fault.FileSkip(fault.SkipMalware, itemID, itemName, addtl) skip := fault.FileSkip(fault.SkipMalware, itemID, itemName, addtl)
if isFolder { if isFolder {

View File

@ -286,11 +286,13 @@ func (cp *corsoProgress) CachedFile(fname string, size int64) {
// during the upload process. This could be from reading a file or something // during the upload process. This could be from reading a file or something
// else. // else.
func (cp *corsoProgress) Error(relpath string, err error, isIgnored bool) { func (cp *corsoProgress) Error(relpath string, err error, isIgnored bool) {
// The malware case is an artifact of being unable to skip the item // LabelsSkippable is set of malware items or not found items.
// if we catch detection at a late enough stage in collection enumeration. // The malware case is an artifact of being unable to skip the
// This is our next point of error handling, where we can identify and // item if we catch detection at a late enough stage in collection
// skip over the case. // enumeration. The not found could be items deleted in between a
if clues.HasLabel(err, graph.LabelsMalware) { // delta query and a fetch. This is our next point of error
// handling, where we can identify and skip over the case.
if clues.HasLabel(err, graph.LabelsSkippable) {
cp.incExpectedErrs() cp.incExpectedErrs()
return return
} }

View File

@ -32,4 +32,5 @@ func (bc *ByteCounter) Count(i int64) {
type SkippedCounts struct { type SkippedCounts struct {
TotalSkippedItems int `json:"totalSkippedItems"` TotalSkippedItems int `json:"totalSkippedItems"`
SkippedMalware int `json:"skippedMalware"` SkippedMalware int `json:"skippedMalware"`
SkippedNotFound int `json:"skippedNotFound"`
} }

View File

@ -65,8 +65,7 @@ func New(
// TODO: count errData.Items(), not all recovered errors. // TODO: count errData.Items(), not all recovered errors.
errCount = len(ee.Recovered) errCount = len(ee.Recovered)
failMsg string failMsg string
malware int malware, notFound, otherSkips int
otherSkips int
) )
if ee.Failure != nil { if ee.Failure != nil {
@ -75,9 +74,12 @@ func New(
} }
for _, s := range ee.Skipped { for _, s := range ee.Skipped {
if s.HasCause(fault.SkipMalware) { switch true {
case s.HasCause(fault.SkipMalware):
malware++ malware++
} else { case s.HasCause(fault.SkipNotFound):
notFound++
default:
otherSkips++ otherSkips++
} }
} }
@ -194,14 +196,22 @@ func (b Backup) Values() []string {
if b.TotalSkippedItems > 0 { if b.TotalSkippedItems > 0 {
status += fmt.Sprintf("%d skipped", b.TotalSkippedItems) status += fmt.Sprintf("%d skipped", b.TotalSkippedItems)
}
if b.TotalSkippedItems > 0 && b.SkippedMalware > 0 { if b.SkippedMalware+b.SkippedNotFound > 0 {
status += ": " status += ": "
} }
}
if b.SkippedMalware > 0 { if b.SkippedMalware > 0 {
status += fmt.Sprintf("%d malware", b.SkippedMalware) status += fmt.Sprintf("%d malware", b.SkippedMalware)
if b.SkippedNotFound > 0 {
status += ", "
}
}
if b.SkippedNotFound > 0 {
status += fmt.Sprintf("%d not found", b.SkippedNotFound)
} }
if errCount+b.TotalSkippedItems > 0 { if errCount+b.TotalSkippedItems > 0 {

View File

@ -117,6 +117,17 @@ func (suite *BackupUnitSuite) TestBackup_Values_statusVariations() {
}, },
expect: "test (2 skipped: 1 malware)", expect: "test (2 skipped: 1 malware)",
}, },
{
name: "not found",
bup: backup.Backup{
Status: "test",
SkippedCounts: stats.SkippedCounts{
TotalSkippedItems: 2,
SkippedNotFound: 1,
},
},
expect: "test (2 skipped: 1 not found)",
},
{ {
name: "errors and malware", name: "errors and malware",
bup: backup.Backup{ bup: backup.Backup{
@ -129,6 +140,31 @@ func (suite *BackupUnitSuite) TestBackup_Values_statusVariations() {
}, },
expect: "test (42 errors, 1 skipped: 1 malware)", expect: "test (42 errors, 1 skipped: 1 malware)",
}, },
{
name: "errors and not found",
bup: backup.Backup{
Status: "test",
ErrorCount: 42,
SkippedCounts: stats.SkippedCounts{
TotalSkippedItems: 1,
SkippedNotFound: 1,
},
},
expect: "test (42 errors, 1 skipped: 1 not found)",
},
{
name: "errors, malware, notFound",
bup: backup.Backup{
Status: "test",
ErrorCount: 42,
SkippedCounts: stats.SkippedCounts{
TotalSkippedItems: 1,
SkippedMalware: 1,
SkippedNotFound: 1,
},
},
expect: "test (42 errors, 1 skipped: 1 malware, 1 not found)",
},
} }
for _, test := range table { for _, test := range table {
suite.Run(test.name, func() { suite.Run(test.name, func() {

View File

@ -109,6 +109,10 @@ type skipCause string
// fail any attempts to backup or restore. // fail any attempts to backup or restore.
const SkipMalware skipCause = "malware_detected" const SkipMalware skipCause = "malware_detected"
// SkipNotFound identifies that a file was skipped because we could
// not find it when trying to download contents
const SkipNotFound skipCause = "file_not_found"
// Skipped items are permanently unprocessable due to well-known conditions. // Skipped items are permanently unprocessable due to well-known conditions.
// In order to skip an item, the following conditions should be met: // In order to skip an item, the following conditions should be met:
// 1. The conditions for skipping the item are well-known and // 1. The conditions for skipping the item are well-known and