diff --git a/CHANGELOG.md b/CHANGELOG.md index b65e0ccbf..357e5f8f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Enable compression for all data uploaded by kopia. - SharePoint --folder selectors correctly return items. - Fix Exchange cli args for filtering items +- Skip huge OneNote items that Graph API prevents us from downloading +- Skip OneNote items bigger than 2GB (Graph API prevents us from downloading them) ## [v0.6.1] (beta) - 2023-03-21 diff --git a/src/internal/connector/onedrive/collection.go b/src/internal/connector/onedrive/collection.go index 5ac7a14dd..25df599fc 100644 --- a/src/internal/connector/onedrive/collection.go +++ b/src/internal/connector/onedrive/collection.go @@ -41,6 +41,9 @@ const ( MetaFileSuffix = ".meta" DirMetaFileSuffix = ".dirmeta" DataFileSuffix = ".data" + + // Used to compare in case of OneNote files + MaxOneNoteFileSize = 2 * 1024 * 1024 * 1024 ) func IsMetaFile(name string) bool { @@ -96,6 +99,14 @@ type Collection struct { // Specifies if it new, moved/rename or deleted state data.CollectionState + // scope specifies what scope the items in a collection belongs + // to. This is primarily useful when dealing with a "package", + // like in the case of a OneNote file. A OneNote file is a + // collection with a package scope and multiple files in it. Most + // other collections have a scope of folder to indicate that the + // files within them belong to a folder. + scope collectionScope + // should only be true if the old delta token expired doNotMergeItems bool } @@ -134,6 +145,7 @@ func NewCollection( statusUpdater support.StatusUpdater, source driveSource, ctrlOpts control.Options, + colScope collectionScope, doNotMergeItems bool, ) *Collection { c := &Collection{ @@ -148,6 +160,7 @@ func NewCollection( statusUpdater: statusUpdater, ctrl: ctrlOpts, state: data.StateOf(prevPath, folderPath), + scope: colScope, doNotMergeItems: doNotMergeItems, } @@ -345,12 +358,27 @@ func (oc *Collection) getDriveItemContent( } if clues.HasLabel(err, graph.LabelStatus(http.StatusNotFound)) || graph.IsErrDeletedInFlight(err) { - logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipNotFound).Error("item not found") + logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipNotFound).Info("item not found") el.AddSkip(fault.FileSkip(fault.SkipNotFound, itemID, itemName, graph.ItemInfo(item))) return nil, clues.Wrap(err, "downloading item").Label(graph.LabelsSkippable) } + // Skip big OneNote files as they can't be downloaded + if clues.HasLabel(err, graph.LabelStatus(http.StatusServiceUnavailable)) && + oc.scope == CollectionScopePackage && *item.GetSize() >= MaxOneNoteFileSize { + // FIXME: It is possible that in case of a OneNote file we + // will end up just backing up the `onetoc2` file without + // the one file which is the important part of the OneNote + // "item". This will have to be handled during the + // restore, or we have to handle it separately by somehow + // deleting the entire collection. + logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipBigOneNote).Info("max OneNote file size exceeded") + el.AddSkip(fault.FileSkip(fault.SkipBigOneNote, itemID, itemName, graph.ItemInfo(item))) + + return nil, clues.Wrap(err, "downloading item").Label(graph.LabelsSkippable) + } + logger.CtxErr(ctx, err).Error("downloading item") el.AddRecoverable(clues.Stack(err).WithClues(ctx).Label(fault.LabelForceNoBackupCreation)) diff --git a/src/internal/connector/onedrive/collection_test.go b/src/internal/connector/onedrive/collection_test.go index dbf5d76a3..e870f94e3 100644 --- a/src/internal/connector/onedrive/collection_test.go +++ b/src/internal/connector/onedrive/collection_test.go @@ -214,6 +214,7 @@ func (suite *CollectionUnitTestSuite) TestCollection() { suite.testStatusUpdater(&wg, &collStatus), test.source, control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}}, + CollectionScopeFolder, true) require.NotNil(t, coll) assert.Equal(t, folderPath, coll.FullPath()) @@ -353,6 +354,7 @@ func (suite *CollectionUnitTestSuite) TestCollectionReadError() { suite.testStatusUpdater(&wg, &collStatus), test.source, control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}}, + CollectionScopeFolder, true) mockItem := models.NewDriveItem() @@ -442,6 +444,7 @@ func (suite *CollectionUnitTestSuite) TestCollectionReadUnauthorizedErrorRetry() suite.testStatusUpdater(&wg, &collStatus), test.source, control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}}, + CollectionScopeFolder, true) mockItem := models.NewDriveItem() @@ -541,6 +544,7 @@ func (suite *CollectionUnitTestSuite) TestCollectionPermissionBackupLatestModTim suite.testStatusUpdater(&wg, &collStatus), test.source, control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}}, + CollectionScopeFolder, true) mtime := time.Now().AddDate(0, -1, 0) @@ -597,3 +601,123 @@ func (suite *CollectionUnitTestSuite) TestCollectionPermissionBackupLatestModTim }) } } + +type GetDriveItemUnitTestSuite struct { + tester.Suite +} + +func TestGetDriveItemUnitTestSuite(t *testing.T) { + suite.Run(t, &GetDriveItemUnitTestSuite{Suite: tester.NewUnitSuite(t)}) +} + +func (suite *GetDriveItemUnitTestSuite) TestGetDriveItemError() { + strval := "not-important" + + table := []struct { + name string + colScope collectionScope + itemSize int64 + labels []string + err error + }{ + { + name: "Simple item fetch no error", + colScope: CollectionScopeFolder, + itemSize: 10, + err: nil, + }, + { + name: "Simple item fetch error", + colScope: CollectionScopeFolder, + itemSize: 10, + err: assert.AnError, + }, + { + name: "malware error", + colScope: CollectionScopeFolder, + itemSize: 10, + err: clues.New("test error").Label(graph.LabelsMalware), + labels: []string{graph.LabelsMalware, graph.LabelsSkippable}, + }, + { + name: "file not found error", + colScope: CollectionScopeFolder, + itemSize: 10, + err: clues.New("test error").Label(graph.LabelStatus(http.StatusNotFound)), + labels: []string{graph.LabelStatus(http.StatusNotFound), graph.LabelsSkippable}, + }, + { + // This should create an error that stops the backup + name: "small OneNote file", + colScope: CollectionScopePackage, + itemSize: 10, + err: clues.New("test error").Label(graph.LabelStatus(http.StatusServiceUnavailable)), + labels: []string{graph.LabelStatus(http.StatusServiceUnavailable)}, + }, + { + name: "big OneNote file", + colScope: CollectionScopePackage, + itemSize: MaxOneNoteFileSize, + err: clues.New("test error").Label(graph.LabelStatus(http.StatusServiceUnavailable)), + labels: []string{graph.LabelStatus(http.StatusServiceUnavailable), graph.LabelsSkippable}, + }, + { + // This should block backup, only big OneNote files should be a problem + name: "big file", + colScope: CollectionScopeFolder, + itemSize: MaxOneNoteFileSize, + err: clues.New("test error").Label(graph.LabelStatus(http.StatusServiceUnavailable)), + labels: []string{graph.LabelStatus(http.StatusServiceUnavailable)}, + }, + } + + for _, test := range table { + suite.Run(test.name, func() { + ctx, flush := tester.NewContext() + defer flush() + + var ( + t = suite.T() + errs = fault.New(false) + item = models.NewDriveItem() + col = &Collection{scope: test.colScope} + ) + + item.SetId(&strval) + item.SetName(&strval) + item.SetSize(&test.itemSize) + + col.itemReader = func( + ctx context.Context, + hc *http.Client, + item models.DriveItemable, + ) (details.ItemInfo, io.ReadCloser, error) { + return details.ItemInfo{}, nil, test.err + } + + col.itemGetter = func( + ctx context.Context, + srv graph.Servicer, + driveID, itemID string, + ) (models.DriveItemable, error) { + // We are not testing this err here + return item, nil + } + + _, err := col.getDriveItemContent(ctx, item, errs) + if test.err == nil { + assert.NoError(t, err, "no error") + return + } + + assert.EqualError(t, err, clues.Wrap(test.err, "downloading item").Error(), "error") + + labelsMap := map[string]struct{}{} + for _, l := range test.labels { + labelsMap[l] = struct{}{} + } + + assert.Equal(t, labelsMap, clues.Labels(err)) + }) + } +} diff --git a/src/internal/connector/onedrive/collections.go b/src/internal/connector/onedrive/collections.go index acab9e573..70ad1be57 100644 --- a/src/internal/connector/onedrive/collections.go +++ b/src/internal/connector/onedrive/collections.go @@ -33,6 +33,20 @@ const ( SharePointSource ) +type collectionScope int + +const ( + // CollectionScopeUnknown is used when we don't know and don't need + // to know the kind, like in the case of deletes + CollectionScopeUnknown collectionScope = iota + + // CollectionScopeFolder is used for regular folder collections + CollectionScopeFolder + + // CollectionScopePackage is used to represent OneNote items + CollectionScopePackage +) + const ( restrictedDirectory = "Site Pages" rootDrivePattern = "/drives/%s/root:" @@ -411,6 +425,7 @@ func (c *Collections) Get( c.statusUpdater, c.source, c.ctrl, + CollectionScopeUnknown, true) c.CollectionMap[driveID][fldID] = col @@ -572,6 +587,7 @@ func (c *Collections) handleDelete( c.statusUpdater, c.source, c.ctrl, + CollectionScopeUnknown, // DoNotMerge is not checked for deleted items. false) @@ -744,6 +760,11 @@ func (c *Collections) UpdateCollections( continue } + colScope := CollectionScopeFolder + if item.GetPackage() != nil { + colScope = CollectionScopePackage + } + col := NewCollection( c.itemClient, collectionPath, @@ -753,6 +774,7 @@ func (c *Collections) UpdateCollections( c.statusUpdater, c.source, c.ctrl, + colScope, invalidPrevDelta, ) col.driveName = driveName diff --git a/src/internal/stats/stats.go b/src/internal/stats/stats.go index 32402221d..c061e67bc 100644 --- a/src/internal/stats/stats.go +++ b/src/internal/stats/stats.go @@ -30,7 +30,8 @@ func (bc *ByteCounter) Count(i int64) { } type SkippedCounts struct { - TotalSkippedItems int `json:"totalSkippedItems"` - SkippedMalware int `json:"skippedMalware"` - SkippedNotFound int `json:"skippedNotFound"` + TotalSkippedItems int `json:"totalSkippedItems"` + SkippedMalware int `json:"skippedMalware"` + SkippedNotFound int `json:"skippedNotFound"` + SkippedInvalidOneNoteFile int `json:"skippedInvalidOneNoteFile"` } diff --git a/src/pkg/backup/backup.go b/src/pkg/backup/backup.go index 24f02992a..c8beef662 100644 --- a/src/pkg/backup/backup.go +++ b/src/pkg/backup/backup.go @@ -3,6 +3,7 @@ package backup import ( "context" "fmt" + "strings" "time" "github.com/alcionai/corso/src/cli/print" @@ -75,10 +76,12 @@ func New( } var ( - errCount = len(fe.Items) - skipCount = len(fe.Skipped) - failMsg string - malware, notFound, otherSkips int + errCount = len(fe.Items) + skipCount = len(fe.Skipped) + failMsg string + + malware, notFound, + invalidONFile, otherSkips int ) if fe.Failure != nil { @@ -92,6 +95,8 @@ func New( malware++ case s.HasCause(fault.SkipNotFound): notFound++ + case s.HasCause(fault.SkipBigOneNote): + invalidONFile++ default: otherSkips++ } @@ -121,9 +126,10 @@ func New( ReadWrites: rw, StartAndEndTime: se, SkippedCounts: stats.SkippedCounts{ - TotalSkippedItems: skipCount, - SkippedMalware: malware, - SkippedNotFound: notFound, + TotalSkippedItems: skipCount, + SkippedMalware: malware, + SkippedNotFound: notFound, + SkippedInvalidOneNoteFile: invalidONFile, }, } } @@ -211,23 +217,27 @@ func (b Backup) Values() []string { if b.TotalSkippedItems > 0 { status += fmt.Sprintf("%d skipped", b.TotalSkippedItems) - if b.SkippedMalware+b.SkippedNotFound > 0 { + if b.SkippedMalware+b.SkippedNotFound+b.SkippedInvalidOneNoteFile > 0 { status += ": " } } - if b.SkippedMalware > 0 { - status += fmt.Sprintf("%d malware", b.SkippedMalware) + skipped := []string{} - if b.SkippedNotFound > 0 { - status += ", " - } + if b.SkippedMalware > 0 { + skipped = append(skipped, fmt.Sprintf("%d malware", b.SkippedMalware)) } if b.SkippedNotFound > 0 { - status += fmt.Sprintf("%d not found", b.SkippedNotFound) + skipped = append(skipped, fmt.Sprintf("%d not found", b.SkippedNotFound)) } + if b.SkippedInvalidOneNoteFile > 0 { + skipped = append(skipped, fmt.Sprintf("%d invalid OneNote file", b.SkippedInvalidOneNoteFile)) + } + + status += strings.Join(skipped, ", ") + if errCount+b.TotalSkippedItems > 0 { status += (")") } diff --git a/src/pkg/backup/backup_test.go b/src/pkg/backup/backup_test.go index 570bcf187..59beeb313 100644 --- a/src/pkg/backup/backup_test.go +++ b/src/pkg/backup/backup_test.go @@ -153,17 +153,30 @@ func (suite *BackupUnitSuite) TestBackup_Values_statusVariations() { expect: "test (42 errors, 1 skipped: 1 not found)", }, { - name: "errors, malware, notFound", + name: "errors and invalid OneNote", bup: backup.Backup{ Status: "test", ErrorCount: 42, SkippedCounts: stats.SkippedCounts{ - TotalSkippedItems: 1, - SkippedMalware: 1, - SkippedNotFound: 1, + TotalSkippedItems: 1, + SkippedInvalidOneNoteFile: 1, }, }, - expect: "test (42 errors, 1 skipped: 1 malware, 1 not found)", + expect: "test (42 errors, 1 skipped: 1 invalid OneNote file)", + }, + { + name: "errors, malware, notFound, invalid OneNote", + bup: backup.Backup{ + Status: "test", + ErrorCount: 42, + SkippedCounts: stats.SkippedCounts{ + TotalSkippedItems: 1, + SkippedMalware: 1, + SkippedNotFound: 1, + SkippedInvalidOneNoteFile: 1, + }, + }, + expect: "test (42 errors, 1 skipped: 1 malware, 1 not found, 1 invalid OneNote file)", }, } for _, test := range table { diff --git a/src/pkg/fault/item.go b/src/pkg/fault/item.go index 83bdf48c1..551fb01fb 100644 --- a/src/pkg/fault/item.go +++ b/src/pkg/fault/item.go @@ -156,6 +156,13 @@ const ( // SkipNotFound identifies that a file was skipped because we could // not find it when trying to download contents SkipNotFound skipCause = "file_not_found" + + // SkipBigOneNote identifies that a file was skipped because it + // was big OneNote file and we can only download OneNote files which + // are less that 2GB in size. + //nolint:lll + // https://support.microsoft.com/en-us/office/restrictions-and-limitations-in-onedrive-and-sharepoint-64883a5d-228e-48f5-b3d2-eb39e07630fa#onenotenotebooks + SkipBigOneNote skipCause = "big_one_note_file" ) var _ print.Printable = &Skipped{}