Skip items returning 404 (#2805)

Treat a 404 as an unavailable item and skip it similar to malware items.

---

#### Does this PR need a docs update or release note?

- [x]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [ ]  No

#### Type of change

<!--- Please check the type of change your PR introduces: --->
- [ ] 🌻 Feature
- [x] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Test
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

<!-- Can reference multiple issues. Use one of the following "magic words" - "closes, fixes" to auto-close the Github issue. -->
* #<issue>

#### Test Plan

<!-- How will this be tested prior to merging.-->
- [ ] 💪 Manual
- [x]  Unit test
- [ ] 💚 E2E
Treat a 404 as an unavailable item and skip it similar to malware items.
This commit is contained in:
Abin Simon 2023-03-15 23:22:22 +05:30 committed by GitHub
parent 766dbb2c94
commit 0125876192
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 114 additions and 33 deletions

View File

@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Sharepoint library (document files) support: backup, list, details, and restore.
- OneDrive item downloads that return 404 during backup (normally due to external deletion while Corso processes) are now skipped instead of quietly dropped. These items will appear in the skipped list alongside other skipped cases such as malware detection.
### Fixed
- Fix repo connect not working without a config file

View File

@ -46,6 +46,9 @@ const (
const (
LabelsMalware = "malware_detected"
LabelsMysiteNotFound = "mysite_not_found"
// LabelsSkippable is used to determine if an error is skippable
LabelsSkippable = "skippable_errors"
)
var (
@ -56,7 +59,7 @@ var (
// Delta tokens can be desycned or expired. In either case, the token
// becomes invalid, and cannot be used again.
// https://learn.microsoft.com/en-us/graph/errors#code-property
ErrInvalidDelta = clues.New("inalid delta token")
ErrInvalidDelta = clues.New("invalid delta token")
// Timeout errors are identified for tracking the need to retry calls.
// Other delay errors, like throttling, are already handled by the
@ -255,9 +258,9 @@ func appendIf(a []any, k string, v *string) []any {
return append(a, k, *v)
}
// MalwareInfo gathers potentially useful information about a malware infected
// drive item, and aggregates that data into a map.
func MalwareInfo(item models.DriveItemable) map[string]any {
// ItemInfo gathers potentially useful information about a drive item,
// and aggregates that data into a map.
func ItemInfo(item models.DriveItemable) map[string]any {
m := map[string]any{}
creator := item.GetCreatedByUser()

View File

@ -259,5 +259,5 @@ func (suite *GraphErrorsUnitSuite) TestMalwareInfo() {
fault.AddtlMalwareDesc: malDesc,
}
assert.Equal(suite.T(), expect, MalwareInfo(&i))
assert.Equal(suite.T(), expect, ItemInfo(&i))
}

View File

@ -318,7 +318,6 @@ func (oc *Collection) getDriveItemContent(
// Initial try with url from delta + 2 retries
for i := 1; i <= maxDownloadRetires; i++ {
_, itemData, err = oc.itemReader(ctx, oc.itemClient, item)
if err == nil || !graph.IsErrUnauthorized(err) {
break
}
@ -339,16 +338,25 @@ func (oc *Collection) getDriveItemContent(
// check for errors following retries
if err != nil {
if clues.HasLabel(err, graph.LabelsMalware) || (item != nil && item.GetMalware() != nil) {
logger.Ctx(ctx).With("error", err.Error(), "malware", true).Error("downloading item")
el.AddSkip(fault.FileSkip(fault.SkipMalware, itemID, itemName, graph.MalwareInfo(item)))
} else {
logger.Ctx(ctx).With("error", err.Error()).Error("downloading item")
el.AddRecoverable(clues.Stack(err).WithClues(ctx).Label(fault.LabelForceNoBackupCreation))
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipMalware).Info("item flagged as malware")
el.AddSkip(fault.FileSkip(fault.SkipMalware, itemID, itemName, graph.ItemInfo(item)))
return nil, clues.Wrap(err, "downloading item").Label(graph.LabelsSkippable)
}
if clues.HasLabel(err, graph.LabelStatus(http.StatusNotFound)) || graph.IsErrDeletedInFlight(err) {
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipNotFound).Error("item not found")
el.AddSkip(fault.FileSkip(fault.SkipNotFound, itemID, itemName, graph.ItemInfo(item)))
return nil, clues.Wrap(err, "downloading item").Label(graph.LabelsSkippable)
}
logger.CtxErr(ctx, err).Error("downloading item")
el.AddRecoverable(clues.Stack(err).WithClues(ctx).Label(fault.LabelForceNoBackupCreation))
// return err, not el.Err(), because the lazy reader needs to communicate to
// the data consumer that this item is unreadable, regardless of the fault state.
return nil, err
return nil, clues.Wrap(err, "downloading item")
}
return itemData, nil

View File

@ -90,7 +90,7 @@ func (suite *CollectionUnitTestSuite) TestCollection() {
itemDeets nst
infoFrom func(*testing.T, details.ItemInfo) (string, string)
expectErr require.ErrorAssertionFunc
expectLabel string
expectLabels []string
}{
{
name: "oneDrive, no duplicates",
@ -136,8 +136,24 @@ func (suite *CollectionUnitTestSuite) TestCollection() {
require.NotNil(t, dii.OneDrive)
return dii.OneDrive.ItemName, dii.OneDrive.ParentPath
},
expectErr: require.Error,
expectLabel: graph.LabelsMalware,
expectErr: require.Error,
expectLabels: []string{graph.LabelsMalware, graph.LabelsSkippable},
},
{
name: "oneDrive, not found",
numInstances: 3,
source: OneDriveSource,
itemDeets: nst{testItemName, 42, now},
// Usually `Not Found` is returned from itemGetter and not itemReader
itemReader: func(context.Context, *http.Client, models.DriveItemable) (details.ItemInfo, io.ReadCloser, error) {
return details.ItemInfo{}, nil, clues.New("test not found").Label(graph.LabelStatus(http.StatusNotFound))
},
infoFrom: func(t *testing.T, dii details.ItemInfo) (string, string) {
require.NotNil(t, dii.OneDrive)
return dii.OneDrive.ItemName, dii.OneDrive.ParentPath
},
expectErr: require.Error,
expectLabels: []string{graph.LabelStatus(http.StatusNotFound), graph.LabelsSkippable},
},
{
name: "sharePoint, no duplicates",
@ -267,8 +283,8 @@ func (suite *CollectionUnitTestSuite) TestCollection() {
test.expectErr(t, err)
if err != nil {
if len(test.expectLabel) > 0 {
assert.True(t, clues.HasLabel(err, test.expectLabel), "has clues label:", test.expectLabel)
for _, label := range test.expectLabels {
assert.True(t, clues.HasLabel(err, label), "has clues label:", label)
}
return

View File

@ -635,7 +635,7 @@ func (c *Collections) UpdateCollections(
)
if item.GetMalware() != nil {
addtl := graph.MalwareInfo(item)
addtl := graph.ItemInfo(item)
skip := fault.FileSkip(fault.SkipMalware, itemID, itemName, addtl)
if isFolder {

View File

@ -286,11 +286,13 @@ func (cp *corsoProgress) CachedFile(fname string, size int64) {
// during the upload process. This could be from reading a file or something
// else.
func (cp *corsoProgress) Error(relpath string, err error, isIgnored bool) {
// The malware case is an artifact of being unable to skip the item
// if we catch detection at a late enough stage in collection enumeration.
// This is our next point of error handling, where we can identify and
// skip over the case.
if clues.HasLabel(err, graph.LabelsMalware) {
// LabelsSkippable is set of malware items or not found items.
// The malware case is an artifact of being unable to skip the
// item if we catch detection at a late enough stage in collection
// enumeration. The not found could be items deleted in between a
// delta query and a fetch. This is our next point of error
// handling, where we can identify and skip over the case.
if clues.HasLabel(err, graph.LabelsSkippable) {
cp.incExpectedErrs()
return
}

View File

@ -32,4 +32,5 @@ func (bc *ByteCounter) Count(i int64) {
type SkippedCounts struct {
TotalSkippedItems int `json:"totalSkippedItems"`
SkippedMalware int `json:"skippedMalware"`
SkippedNotFound int `json:"skippedNotFound"`
}

View File

@ -63,10 +63,9 @@ func New(
var (
ee = errs.Errors()
// TODO: count errData.Items(), not all recovered errors.
errCount = len(ee.Recovered)
failMsg string
malware int
otherSkips int
errCount = len(ee.Recovered)
failMsg string
malware, notFound, otherSkips int
)
if ee.Failure != nil {
@ -75,9 +74,12 @@ func New(
}
for _, s := range ee.Skipped {
if s.HasCause(fault.SkipMalware) {
switch true {
case s.HasCause(fault.SkipMalware):
malware++
} else {
case s.HasCause(fault.SkipNotFound):
notFound++
default:
otherSkips++
}
}
@ -194,14 +196,22 @@ func (b Backup) Values() []string {
if b.TotalSkippedItems > 0 {
status += fmt.Sprintf("%d skipped", b.TotalSkippedItems)
}
if b.TotalSkippedItems > 0 && b.SkippedMalware > 0 {
status += ": "
if b.SkippedMalware+b.SkippedNotFound > 0 {
status += ": "
}
}
if b.SkippedMalware > 0 {
status += fmt.Sprintf("%d malware", b.SkippedMalware)
if b.SkippedNotFound > 0 {
status += ", "
}
}
if b.SkippedNotFound > 0 {
status += fmt.Sprintf("%d not found", b.SkippedNotFound)
}
if errCount+b.TotalSkippedItems > 0 {

View File

@ -117,6 +117,17 @@ func (suite *BackupUnitSuite) TestBackup_Values_statusVariations() {
},
expect: "test (2 skipped: 1 malware)",
},
{
name: "not found",
bup: backup.Backup{
Status: "test",
SkippedCounts: stats.SkippedCounts{
TotalSkippedItems: 2,
SkippedNotFound: 1,
},
},
expect: "test (2 skipped: 1 not found)",
},
{
name: "errors and malware",
bup: backup.Backup{
@ -129,6 +140,31 @@ func (suite *BackupUnitSuite) TestBackup_Values_statusVariations() {
},
expect: "test (42 errors, 1 skipped: 1 malware)",
},
{
name: "errors and not found",
bup: backup.Backup{
Status: "test",
ErrorCount: 42,
SkippedCounts: stats.SkippedCounts{
TotalSkippedItems: 1,
SkippedNotFound: 1,
},
},
expect: "test (42 errors, 1 skipped: 1 not found)",
},
{
name: "errors, malware, notFound",
bup: backup.Backup{
Status: "test",
ErrorCount: 42,
SkippedCounts: stats.SkippedCounts{
TotalSkippedItems: 1,
SkippedMalware: 1,
SkippedNotFound: 1,
},
},
expect: "test (42 errors, 1 skipped: 1 malware, 1 not found)",
},
}
for _, test := range table {
suite.Run(test.name, func() {

View File

@ -109,6 +109,10 @@ type skipCause string
// fail any attempts to backup or restore.
const SkipMalware skipCause = "malware_detected"
// SkipNotFound identifies that a file was skipped because we could
// not find it when trying to download contents
const SkipNotFound skipCause = "file_not_found"
// Skipped items are permanently unprocessable due to well-known conditions.
// In order to skip an item, the following conditions should be met:
// 1. The conditions for skipping the item are well-known and