From 446ccfe4912bd7bf334bff46c69f3260e010159d Mon Sep 17 00:00:00 2001 From: Abin Simon Date: Fri, 31 Mar 2023 10:13:34 +0530 Subject: [PATCH] Skip large OneNote items that cannot be downloaded (#2925) OneNote files >2GB cannot be downloaded via the graph API. This change will gracefully skip them and prevent the backup from erroring out. A OneNote file is represented by a Package(folder) which contains two file within it, `.one` and `.onetoc2`. From what I can tell the `onetoc2` file is the metadata file and the `one` file is the one that contains data and can get big. **The current implementation has a limitation(or feature depending on how you see it) in that we will end up doing a partial backup with just the `onetoc2` file but avoiding the big `one` file if that is big.** I did not try to change the behaviour as even currently if for somehow we ended up with an error like 404 when trying to backup one of these files, we do skip it. I've added a comment in for now, let me know if we should go back and revisit the behaviour. *The main issue is the fact that we only hit this problem once it reaches kopia and removing it at that point is harder.* --- #### Does this PR need a docs update or release note? - [x] :white_check_mark: Yes, it's included - [ ] :clock1: Yes, but in a later PR - [ ] :no_entry: No #### Type of change - [ ] :sunflower: Feature - [x] :bug: Bugfix - [ ] :world_map: Documentation - [ ] :robot: Supportability/Tests - [ ] :computer: CI/Deployment - [ ] :broom: Tech Debt/Cleanup #### Issue(s) * fixes https://github.com/alcionai/corso/issues/2910 #### Test Plan - [ ] :muscle: Manual - [x] :zap: Unit test - [ ] :green_heart: E2E --- CHANGELOG.md | 2 + src/internal/connector/onedrive/collection.go | 30 ++++- .../connector/onedrive/collection_test.go | 124 ++++++++++++++++++ .../connector/onedrive/collections.go | 22 ++++ src/internal/stats/stats.go | 7 +- src/pkg/backup/backup.go | 38 ++++-- src/pkg/backup/backup_test.go | 23 +++- src/pkg/fault/item.go | 7 + 8 files changed, 230 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b65e0ccbf..357e5f8f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Enable compression for all data uploaded by kopia. - SharePoint --folder selectors correctly return items. - Fix Exchange cli args for filtering items +- Skip huge OneNote items that Graph API prevents us from downloading +- Skip OneNote items bigger than 2GB (Graph API prevents us from downloading them) ## [v0.6.1] (beta) - 2023-03-21 diff --git a/src/internal/connector/onedrive/collection.go b/src/internal/connector/onedrive/collection.go index 5ac7a14dd..25df599fc 100644 --- a/src/internal/connector/onedrive/collection.go +++ b/src/internal/connector/onedrive/collection.go @@ -41,6 +41,9 @@ const ( MetaFileSuffix = ".meta" DirMetaFileSuffix = ".dirmeta" DataFileSuffix = ".data" + + // Used to compare in case of OneNote files + MaxOneNoteFileSize = 2 * 1024 * 1024 * 1024 ) func IsMetaFile(name string) bool { @@ -96,6 +99,14 @@ type Collection struct { // Specifies if it new, moved/rename or deleted state data.CollectionState + // scope specifies what scope the items in a collection belongs + // to. This is primarily useful when dealing with a "package", + // like in the case of a OneNote file. A OneNote file is a + // collection with a package scope and multiple files in it. Most + // other collections have a scope of folder to indicate that the + // files within them belong to a folder. + scope collectionScope + // should only be true if the old delta token expired doNotMergeItems bool } @@ -134,6 +145,7 @@ func NewCollection( statusUpdater support.StatusUpdater, source driveSource, ctrlOpts control.Options, + colScope collectionScope, doNotMergeItems bool, ) *Collection { c := &Collection{ @@ -148,6 +160,7 @@ func NewCollection( statusUpdater: statusUpdater, ctrl: ctrlOpts, state: data.StateOf(prevPath, folderPath), + scope: colScope, doNotMergeItems: doNotMergeItems, } @@ -345,12 +358,27 @@ func (oc *Collection) getDriveItemContent( } if clues.HasLabel(err, graph.LabelStatus(http.StatusNotFound)) || graph.IsErrDeletedInFlight(err) { - logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipNotFound).Error("item not found") + logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipNotFound).Info("item not found") el.AddSkip(fault.FileSkip(fault.SkipNotFound, itemID, itemName, graph.ItemInfo(item))) return nil, clues.Wrap(err, "downloading item").Label(graph.LabelsSkippable) } + // Skip big OneNote files as they can't be downloaded + if clues.HasLabel(err, graph.LabelStatus(http.StatusServiceUnavailable)) && + oc.scope == CollectionScopePackage && *item.GetSize() >= MaxOneNoteFileSize { + // FIXME: It is possible that in case of a OneNote file we + // will end up just backing up the `onetoc2` file without + // the one file which is the important part of the OneNote + // "item". This will have to be handled during the + // restore, or we have to handle it separately by somehow + // deleting the entire collection. + logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipBigOneNote).Info("max OneNote file size exceeded") + el.AddSkip(fault.FileSkip(fault.SkipBigOneNote, itemID, itemName, graph.ItemInfo(item))) + + return nil, clues.Wrap(err, "downloading item").Label(graph.LabelsSkippable) + } + logger.CtxErr(ctx, err).Error("downloading item") el.AddRecoverable(clues.Stack(err).WithClues(ctx).Label(fault.LabelForceNoBackupCreation)) diff --git a/src/internal/connector/onedrive/collection_test.go b/src/internal/connector/onedrive/collection_test.go index dbf5d76a3..e870f94e3 100644 --- a/src/internal/connector/onedrive/collection_test.go +++ b/src/internal/connector/onedrive/collection_test.go @@ -214,6 +214,7 @@ func (suite *CollectionUnitTestSuite) TestCollection() { suite.testStatusUpdater(&wg, &collStatus), test.source, control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}}, + CollectionScopeFolder, true) require.NotNil(t, coll) assert.Equal(t, folderPath, coll.FullPath()) @@ -353,6 +354,7 @@ func (suite *CollectionUnitTestSuite) TestCollectionReadError() { suite.testStatusUpdater(&wg, &collStatus), test.source, control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}}, + CollectionScopeFolder, true) mockItem := models.NewDriveItem() @@ -442,6 +444,7 @@ func (suite *CollectionUnitTestSuite) TestCollectionReadUnauthorizedErrorRetry() suite.testStatusUpdater(&wg, &collStatus), test.source, control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}}, + CollectionScopeFolder, true) mockItem := models.NewDriveItem() @@ -541,6 +544,7 @@ func (suite *CollectionUnitTestSuite) TestCollectionPermissionBackupLatestModTim suite.testStatusUpdater(&wg, &collStatus), test.source, control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}}, + CollectionScopeFolder, true) mtime := time.Now().AddDate(0, -1, 0) @@ -597,3 +601,123 @@ func (suite *CollectionUnitTestSuite) TestCollectionPermissionBackupLatestModTim }) } } + +type GetDriveItemUnitTestSuite struct { + tester.Suite +} + +func TestGetDriveItemUnitTestSuite(t *testing.T) { + suite.Run(t, &GetDriveItemUnitTestSuite{Suite: tester.NewUnitSuite(t)}) +} + +func (suite *GetDriveItemUnitTestSuite) TestGetDriveItemError() { + strval := "not-important" + + table := []struct { + name string + colScope collectionScope + itemSize int64 + labels []string + err error + }{ + { + name: "Simple item fetch no error", + colScope: CollectionScopeFolder, + itemSize: 10, + err: nil, + }, + { + name: "Simple item fetch error", + colScope: CollectionScopeFolder, + itemSize: 10, + err: assert.AnError, + }, + { + name: "malware error", + colScope: CollectionScopeFolder, + itemSize: 10, + err: clues.New("test error").Label(graph.LabelsMalware), + labels: []string{graph.LabelsMalware, graph.LabelsSkippable}, + }, + { + name: "file not found error", + colScope: CollectionScopeFolder, + itemSize: 10, + err: clues.New("test error").Label(graph.LabelStatus(http.StatusNotFound)), + labels: []string{graph.LabelStatus(http.StatusNotFound), graph.LabelsSkippable}, + }, + { + // This should create an error that stops the backup + name: "small OneNote file", + colScope: CollectionScopePackage, + itemSize: 10, + err: clues.New("test error").Label(graph.LabelStatus(http.StatusServiceUnavailable)), + labels: []string{graph.LabelStatus(http.StatusServiceUnavailable)}, + }, + { + name: "big OneNote file", + colScope: CollectionScopePackage, + itemSize: MaxOneNoteFileSize, + err: clues.New("test error").Label(graph.LabelStatus(http.StatusServiceUnavailable)), + labels: []string{graph.LabelStatus(http.StatusServiceUnavailable), graph.LabelsSkippable}, + }, + { + // This should block backup, only big OneNote files should be a problem + name: "big file", + colScope: CollectionScopeFolder, + itemSize: MaxOneNoteFileSize, + err: clues.New("test error").Label(graph.LabelStatus(http.StatusServiceUnavailable)), + labels: []string{graph.LabelStatus(http.StatusServiceUnavailable)}, + }, + } + + for _, test := range table { + suite.Run(test.name, func() { + ctx, flush := tester.NewContext() + defer flush() + + var ( + t = suite.T() + errs = fault.New(false) + item = models.NewDriveItem() + col = &Collection{scope: test.colScope} + ) + + item.SetId(&strval) + item.SetName(&strval) + item.SetSize(&test.itemSize) + + col.itemReader = func( + ctx context.Context, + hc *http.Client, + item models.DriveItemable, + ) (details.ItemInfo, io.ReadCloser, error) { + return details.ItemInfo{}, nil, test.err + } + + col.itemGetter = func( + ctx context.Context, + srv graph.Servicer, + driveID, itemID string, + ) (models.DriveItemable, error) { + // We are not testing this err here + return item, nil + } + + _, err := col.getDriveItemContent(ctx, item, errs) + if test.err == nil { + assert.NoError(t, err, "no error") + return + } + + assert.EqualError(t, err, clues.Wrap(test.err, "downloading item").Error(), "error") + + labelsMap := map[string]struct{}{} + for _, l := range test.labels { + labelsMap[l] = struct{}{} + } + + assert.Equal(t, labelsMap, clues.Labels(err)) + }) + } +} diff --git a/src/internal/connector/onedrive/collections.go b/src/internal/connector/onedrive/collections.go index acab9e573..70ad1be57 100644 --- a/src/internal/connector/onedrive/collections.go +++ b/src/internal/connector/onedrive/collections.go @@ -33,6 +33,20 @@ const ( SharePointSource ) +type collectionScope int + +const ( + // CollectionScopeUnknown is used when we don't know and don't need + // to know the kind, like in the case of deletes + CollectionScopeUnknown collectionScope = iota + + // CollectionScopeFolder is used for regular folder collections + CollectionScopeFolder + + // CollectionScopePackage is used to represent OneNote items + CollectionScopePackage +) + const ( restrictedDirectory = "Site Pages" rootDrivePattern = "/drives/%s/root:" @@ -411,6 +425,7 @@ func (c *Collections) Get( c.statusUpdater, c.source, c.ctrl, + CollectionScopeUnknown, true) c.CollectionMap[driveID][fldID] = col @@ -572,6 +587,7 @@ func (c *Collections) handleDelete( c.statusUpdater, c.source, c.ctrl, + CollectionScopeUnknown, // DoNotMerge is not checked for deleted items. false) @@ -744,6 +760,11 @@ func (c *Collections) UpdateCollections( continue } + colScope := CollectionScopeFolder + if item.GetPackage() != nil { + colScope = CollectionScopePackage + } + col := NewCollection( c.itemClient, collectionPath, @@ -753,6 +774,7 @@ func (c *Collections) UpdateCollections( c.statusUpdater, c.source, c.ctrl, + colScope, invalidPrevDelta, ) col.driveName = driveName diff --git a/src/internal/stats/stats.go b/src/internal/stats/stats.go index 32402221d..c061e67bc 100644 --- a/src/internal/stats/stats.go +++ b/src/internal/stats/stats.go @@ -30,7 +30,8 @@ func (bc *ByteCounter) Count(i int64) { } type SkippedCounts struct { - TotalSkippedItems int `json:"totalSkippedItems"` - SkippedMalware int `json:"skippedMalware"` - SkippedNotFound int `json:"skippedNotFound"` + TotalSkippedItems int `json:"totalSkippedItems"` + SkippedMalware int `json:"skippedMalware"` + SkippedNotFound int `json:"skippedNotFound"` + SkippedInvalidOneNoteFile int `json:"skippedInvalidOneNoteFile"` } diff --git a/src/pkg/backup/backup.go b/src/pkg/backup/backup.go index 24f02992a..c8beef662 100644 --- a/src/pkg/backup/backup.go +++ b/src/pkg/backup/backup.go @@ -3,6 +3,7 @@ package backup import ( "context" "fmt" + "strings" "time" "github.com/alcionai/corso/src/cli/print" @@ -75,10 +76,12 @@ func New( } var ( - errCount = len(fe.Items) - skipCount = len(fe.Skipped) - failMsg string - malware, notFound, otherSkips int + errCount = len(fe.Items) + skipCount = len(fe.Skipped) + failMsg string + + malware, notFound, + invalidONFile, otherSkips int ) if fe.Failure != nil { @@ -92,6 +95,8 @@ func New( malware++ case s.HasCause(fault.SkipNotFound): notFound++ + case s.HasCause(fault.SkipBigOneNote): + invalidONFile++ default: otherSkips++ } @@ -121,9 +126,10 @@ func New( ReadWrites: rw, StartAndEndTime: se, SkippedCounts: stats.SkippedCounts{ - TotalSkippedItems: skipCount, - SkippedMalware: malware, - SkippedNotFound: notFound, + TotalSkippedItems: skipCount, + SkippedMalware: malware, + SkippedNotFound: notFound, + SkippedInvalidOneNoteFile: invalidONFile, }, } } @@ -211,23 +217,27 @@ func (b Backup) Values() []string { if b.TotalSkippedItems > 0 { status += fmt.Sprintf("%d skipped", b.TotalSkippedItems) - if b.SkippedMalware+b.SkippedNotFound > 0 { + if b.SkippedMalware+b.SkippedNotFound+b.SkippedInvalidOneNoteFile > 0 { status += ": " } } - if b.SkippedMalware > 0 { - status += fmt.Sprintf("%d malware", b.SkippedMalware) + skipped := []string{} - if b.SkippedNotFound > 0 { - status += ", " - } + if b.SkippedMalware > 0 { + skipped = append(skipped, fmt.Sprintf("%d malware", b.SkippedMalware)) } if b.SkippedNotFound > 0 { - status += fmt.Sprintf("%d not found", b.SkippedNotFound) + skipped = append(skipped, fmt.Sprintf("%d not found", b.SkippedNotFound)) } + if b.SkippedInvalidOneNoteFile > 0 { + skipped = append(skipped, fmt.Sprintf("%d invalid OneNote file", b.SkippedInvalidOneNoteFile)) + } + + status += strings.Join(skipped, ", ") + if errCount+b.TotalSkippedItems > 0 { status += (")") } diff --git a/src/pkg/backup/backup_test.go b/src/pkg/backup/backup_test.go index 570bcf187..59beeb313 100644 --- a/src/pkg/backup/backup_test.go +++ b/src/pkg/backup/backup_test.go @@ -153,17 +153,30 @@ func (suite *BackupUnitSuite) TestBackup_Values_statusVariations() { expect: "test (42 errors, 1 skipped: 1 not found)", }, { - name: "errors, malware, notFound", + name: "errors and invalid OneNote", bup: backup.Backup{ Status: "test", ErrorCount: 42, SkippedCounts: stats.SkippedCounts{ - TotalSkippedItems: 1, - SkippedMalware: 1, - SkippedNotFound: 1, + TotalSkippedItems: 1, + SkippedInvalidOneNoteFile: 1, }, }, - expect: "test (42 errors, 1 skipped: 1 malware, 1 not found)", + expect: "test (42 errors, 1 skipped: 1 invalid OneNote file)", + }, + { + name: "errors, malware, notFound, invalid OneNote", + bup: backup.Backup{ + Status: "test", + ErrorCount: 42, + SkippedCounts: stats.SkippedCounts{ + TotalSkippedItems: 1, + SkippedMalware: 1, + SkippedNotFound: 1, + SkippedInvalidOneNoteFile: 1, + }, + }, + expect: "test (42 errors, 1 skipped: 1 malware, 1 not found, 1 invalid OneNote file)", }, } for _, test := range table { diff --git a/src/pkg/fault/item.go b/src/pkg/fault/item.go index 83bdf48c1..551fb01fb 100644 --- a/src/pkg/fault/item.go +++ b/src/pkg/fault/item.go @@ -156,6 +156,13 @@ const ( // SkipNotFound identifies that a file was skipped because we could // not find it when trying to download contents SkipNotFound skipCause = "file_not_found" + + // SkipBigOneNote identifies that a file was skipped because it + // was big OneNote file and we can only download OneNote files which + // are less that 2GB in size. + //nolint:lll + // https://support.microsoft.com/en-us/office/restrictions-and-limitations-in-onedrive-and-sharepoint-64883a5d-228e-48f5-b3d2-eb39e07630fa#onenotenotebooks + SkipBigOneNote skipCause = "big_one_note_file" ) var _ print.Printable = &Skipped{}