Skip large OneNote items that cannot be downloaded (#2925)

OneNote files >2GB cannot be downloaded via the graph API. This change will gracefully skip them and prevent the backup from erroring out.

A OneNote file is represented by a Package(folder) which contains two file within it, `<file>.one` and `<file>.onetoc2`.  From what I can tell the `onetoc2` file is the metadata file and the `one` file is the one that contains data and can get big.

**The current implementation has a limitation(or feature depending on how you see it) in that we will end up doing a partial backup with just the `onetoc2` file but avoiding the big `one` file if that is big.**

I did not try to change the behaviour as even currently if for somehow we ended up with an error like 404 when trying to backup one of these files, we do skip it. I've added a comment in for now, let me know if we should go back and revisit the behaviour. *The main issue is the fact that we only hit this problem once it reaches kopia and removing it at that point is harder.*

---

#### Does this PR need a docs update or release note?

- [x]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [ ]  No

#### Type of change

<!--- Please check the type of change your PR introduces: --->
- [ ] 🌻 Feature
- [x] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

<!-- Can reference multiple issues. Use one of the following "magic words" - "closes, fixes" to auto-close the Github issue. -->
* fixes https://github.com/alcionai/corso/issues/2910

#### Test Plan

<!-- How will this be tested prior to merging.-->
- [ ] 💪 Manual
- [x]  Unit test
- [ ] 💚 E2E
This commit is contained in:
Abin Simon 2023-03-31 10:13:34 +05:30 committed by GitHub
parent b23faa0fac
commit 446ccfe491
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 230 additions and 23 deletions

View File

@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Enable compression for all data uploaded by kopia.
- SharePoint --folder selectors correctly return items.
- Fix Exchange cli args for filtering items
- Skip huge OneNote items that Graph API prevents us from downloading
- Skip OneNote items bigger than 2GB (Graph API prevents us from downloading them)
## [v0.6.1] (beta) - 2023-03-21

View File

@ -41,6 +41,9 @@ const (
MetaFileSuffix = ".meta"
DirMetaFileSuffix = ".dirmeta"
DataFileSuffix = ".data"
// Used to compare in case of OneNote files
MaxOneNoteFileSize = 2 * 1024 * 1024 * 1024
)
func IsMetaFile(name string) bool {
@ -96,6 +99,14 @@ type Collection struct {
// Specifies if it new, moved/rename or deleted
state data.CollectionState
// scope specifies what scope the items in a collection belongs
// to. This is primarily useful when dealing with a "package",
// like in the case of a OneNote file. A OneNote file is a
// collection with a package scope and multiple files in it. Most
// other collections have a scope of folder to indicate that the
// files within them belong to a folder.
scope collectionScope
// should only be true if the old delta token expired
doNotMergeItems bool
}
@ -134,6 +145,7 @@ func NewCollection(
statusUpdater support.StatusUpdater,
source driveSource,
ctrlOpts control.Options,
colScope collectionScope,
doNotMergeItems bool,
) *Collection {
c := &Collection{
@ -148,6 +160,7 @@ func NewCollection(
statusUpdater: statusUpdater,
ctrl: ctrlOpts,
state: data.StateOf(prevPath, folderPath),
scope: colScope,
doNotMergeItems: doNotMergeItems,
}
@ -345,12 +358,27 @@ func (oc *Collection) getDriveItemContent(
}
if clues.HasLabel(err, graph.LabelStatus(http.StatusNotFound)) || graph.IsErrDeletedInFlight(err) {
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipNotFound).Error("item not found")
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipNotFound).Info("item not found")
el.AddSkip(fault.FileSkip(fault.SkipNotFound, itemID, itemName, graph.ItemInfo(item)))
return nil, clues.Wrap(err, "downloading item").Label(graph.LabelsSkippable)
}
// Skip big OneNote files as they can't be downloaded
if clues.HasLabel(err, graph.LabelStatus(http.StatusServiceUnavailable)) &&
oc.scope == CollectionScopePackage && *item.GetSize() >= MaxOneNoteFileSize {
// FIXME: It is possible that in case of a OneNote file we
// will end up just backing up the `onetoc2` file without
// the one file which is the important part of the OneNote
// "item". This will have to be handled during the
// restore, or we have to handle it separately by somehow
// deleting the entire collection.
logger.CtxErr(ctx, err).With("skipped_reason", fault.SkipBigOneNote).Info("max OneNote file size exceeded")
el.AddSkip(fault.FileSkip(fault.SkipBigOneNote, itemID, itemName, graph.ItemInfo(item)))
return nil, clues.Wrap(err, "downloading item").Label(graph.LabelsSkippable)
}
logger.CtxErr(ctx, err).Error("downloading item")
el.AddRecoverable(clues.Stack(err).WithClues(ctx).Label(fault.LabelForceNoBackupCreation))

View File

@ -214,6 +214,7 @@ func (suite *CollectionUnitTestSuite) TestCollection() {
suite.testStatusUpdater(&wg, &collStatus),
test.source,
control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}},
CollectionScopeFolder,
true)
require.NotNil(t, coll)
assert.Equal(t, folderPath, coll.FullPath())
@ -353,6 +354,7 @@ func (suite *CollectionUnitTestSuite) TestCollectionReadError() {
suite.testStatusUpdater(&wg, &collStatus),
test.source,
control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}},
CollectionScopeFolder,
true)
mockItem := models.NewDriveItem()
@ -442,6 +444,7 @@ func (suite *CollectionUnitTestSuite) TestCollectionReadUnauthorizedErrorRetry()
suite.testStatusUpdater(&wg, &collStatus),
test.source,
control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}},
CollectionScopeFolder,
true)
mockItem := models.NewDriveItem()
@ -541,6 +544,7 @@ func (suite *CollectionUnitTestSuite) TestCollectionPermissionBackupLatestModTim
suite.testStatusUpdater(&wg, &collStatus),
test.source,
control.Options{ToggleFeatures: control.Toggles{EnablePermissionsBackup: true}},
CollectionScopeFolder,
true)
mtime := time.Now().AddDate(0, -1, 0)
@ -597,3 +601,123 @@ func (suite *CollectionUnitTestSuite) TestCollectionPermissionBackupLatestModTim
})
}
}
type GetDriveItemUnitTestSuite struct {
tester.Suite
}
func TestGetDriveItemUnitTestSuite(t *testing.T) {
suite.Run(t, &GetDriveItemUnitTestSuite{Suite: tester.NewUnitSuite(t)})
}
func (suite *GetDriveItemUnitTestSuite) TestGetDriveItemError() {
strval := "not-important"
table := []struct {
name string
colScope collectionScope
itemSize int64
labels []string
err error
}{
{
name: "Simple item fetch no error",
colScope: CollectionScopeFolder,
itemSize: 10,
err: nil,
},
{
name: "Simple item fetch error",
colScope: CollectionScopeFolder,
itemSize: 10,
err: assert.AnError,
},
{
name: "malware error",
colScope: CollectionScopeFolder,
itemSize: 10,
err: clues.New("test error").Label(graph.LabelsMalware),
labels: []string{graph.LabelsMalware, graph.LabelsSkippable},
},
{
name: "file not found error",
colScope: CollectionScopeFolder,
itemSize: 10,
err: clues.New("test error").Label(graph.LabelStatus(http.StatusNotFound)),
labels: []string{graph.LabelStatus(http.StatusNotFound), graph.LabelsSkippable},
},
{
// This should create an error that stops the backup
name: "small OneNote file",
colScope: CollectionScopePackage,
itemSize: 10,
err: clues.New("test error").Label(graph.LabelStatus(http.StatusServiceUnavailable)),
labels: []string{graph.LabelStatus(http.StatusServiceUnavailable)},
},
{
name: "big OneNote file",
colScope: CollectionScopePackage,
itemSize: MaxOneNoteFileSize,
err: clues.New("test error").Label(graph.LabelStatus(http.StatusServiceUnavailable)),
labels: []string{graph.LabelStatus(http.StatusServiceUnavailable), graph.LabelsSkippable},
},
{
// This should block backup, only big OneNote files should be a problem
name: "big file",
colScope: CollectionScopeFolder,
itemSize: MaxOneNoteFileSize,
err: clues.New("test error").Label(graph.LabelStatus(http.StatusServiceUnavailable)),
labels: []string{graph.LabelStatus(http.StatusServiceUnavailable)},
},
}
for _, test := range table {
suite.Run(test.name, func() {
ctx, flush := tester.NewContext()
defer flush()
var (
t = suite.T()
errs = fault.New(false)
item = models.NewDriveItem()
col = &Collection{scope: test.colScope}
)
item.SetId(&strval)
item.SetName(&strval)
item.SetSize(&test.itemSize)
col.itemReader = func(
ctx context.Context,
hc *http.Client,
item models.DriveItemable,
) (details.ItemInfo, io.ReadCloser, error) {
return details.ItemInfo{}, nil, test.err
}
col.itemGetter = func(
ctx context.Context,
srv graph.Servicer,
driveID, itemID string,
) (models.DriveItemable, error) {
// We are not testing this err here
return item, nil
}
_, err := col.getDriveItemContent(ctx, item, errs)
if test.err == nil {
assert.NoError(t, err, "no error")
return
}
assert.EqualError(t, err, clues.Wrap(test.err, "downloading item").Error(), "error")
labelsMap := map[string]struct{}{}
for _, l := range test.labels {
labelsMap[l] = struct{}{}
}
assert.Equal(t, labelsMap, clues.Labels(err))
})
}
}

View File

@ -33,6 +33,20 @@ const (
SharePointSource
)
type collectionScope int
const (
// CollectionScopeUnknown is used when we don't know and don't need
// to know the kind, like in the case of deletes
CollectionScopeUnknown collectionScope = iota
// CollectionScopeFolder is used for regular folder collections
CollectionScopeFolder
// CollectionScopePackage is used to represent OneNote items
CollectionScopePackage
)
const (
restrictedDirectory = "Site Pages"
rootDrivePattern = "/drives/%s/root:"
@ -411,6 +425,7 @@ func (c *Collections) Get(
c.statusUpdater,
c.source,
c.ctrl,
CollectionScopeUnknown,
true)
c.CollectionMap[driveID][fldID] = col
@ -572,6 +587,7 @@ func (c *Collections) handleDelete(
c.statusUpdater,
c.source,
c.ctrl,
CollectionScopeUnknown,
// DoNotMerge is not checked for deleted items.
false)
@ -744,6 +760,11 @@ func (c *Collections) UpdateCollections(
continue
}
colScope := CollectionScopeFolder
if item.GetPackage() != nil {
colScope = CollectionScopePackage
}
col := NewCollection(
c.itemClient,
collectionPath,
@ -753,6 +774,7 @@ func (c *Collections) UpdateCollections(
c.statusUpdater,
c.source,
c.ctrl,
colScope,
invalidPrevDelta,
)
col.driveName = driveName

View File

@ -33,4 +33,5 @@ type SkippedCounts struct {
TotalSkippedItems int `json:"totalSkippedItems"`
SkippedMalware int `json:"skippedMalware"`
SkippedNotFound int `json:"skippedNotFound"`
SkippedInvalidOneNoteFile int `json:"skippedInvalidOneNoteFile"`
}

View File

@ -3,6 +3,7 @@ package backup
import (
"context"
"fmt"
"strings"
"time"
"github.com/alcionai/corso/src/cli/print"
@ -78,7 +79,9 @@ func New(
errCount = len(fe.Items)
skipCount = len(fe.Skipped)
failMsg string
malware, notFound, otherSkips int
malware, notFound,
invalidONFile, otherSkips int
)
if fe.Failure != nil {
@ -92,6 +95,8 @@ func New(
malware++
case s.HasCause(fault.SkipNotFound):
notFound++
case s.HasCause(fault.SkipBigOneNote):
invalidONFile++
default:
otherSkips++
}
@ -124,6 +129,7 @@ func New(
TotalSkippedItems: skipCount,
SkippedMalware: malware,
SkippedNotFound: notFound,
SkippedInvalidOneNoteFile: invalidONFile,
},
}
}
@ -211,23 +217,27 @@ func (b Backup) Values() []string {
if b.TotalSkippedItems > 0 {
status += fmt.Sprintf("%d skipped", b.TotalSkippedItems)
if b.SkippedMalware+b.SkippedNotFound > 0 {
if b.SkippedMalware+b.SkippedNotFound+b.SkippedInvalidOneNoteFile > 0 {
status += ": "
}
}
skipped := []string{}
if b.SkippedMalware > 0 {
status += fmt.Sprintf("%d malware", b.SkippedMalware)
if b.SkippedNotFound > 0 {
status += ", "
}
skipped = append(skipped, fmt.Sprintf("%d malware", b.SkippedMalware))
}
if b.SkippedNotFound > 0 {
status += fmt.Sprintf("%d not found", b.SkippedNotFound)
skipped = append(skipped, fmt.Sprintf("%d not found", b.SkippedNotFound))
}
if b.SkippedInvalidOneNoteFile > 0 {
skipped = append(skipped, fmt.Sprintf("%d invalid OneNote file", b.SkippedInvalidOneNoteFile))
}
status += strings.Join(skipped, ", ")
if errCount+b.TotalSkippedItems > 0 {
status += (")")
}

View File

@ -153,7 +153,19 @@ func (suite *BackupUnitSuite) TestBackup_Values_statusVariations() {
expect: "test (42 errors, 1 skipped: 1 not found)",
},
{
name: "errors, malware, notFound",
name: "errors and invalid OneNote",
bup: backup.Backup{
Status: "test",
ErrorCount: 42,
SkippedCounts: stats.SkippedCounts{
TotalSkippedItems: 1,
SkippedInvalidOneNoteFile: 1,
},
},
expect: "test (42 errors, 1 skipped: 1 invalid OneNote file)",
},
{
name: "errors, malware, notFound, invalid OneNote",
bup: backup.Backup{
Status: "test",
ErrorCount: 42,
@ -161,9 +173,10 @@ func (suite *BackupUnitSuite) TestBackup_Values_statusVariations() {
TotalSkippedItems: 1,
SkippedMalware: 1,
SkippedNotFound: 1,
SkippedInvalidOneNoteFile: 1,
},
},
expect: "test (42 errors, 1 skipped: 1 malware, 1 not found)",
expect: "test (42 errors, 1 skipped: 1 malware, 1 not found, 1 invalid OneNote file)",
},
}
for _, test := range table {

View File

@ -156,6 +156,13 @@ const (
// SkipNotFound identifies that a file was skipped because we could
// not find it when trying to download contents
SkipNotFound skipCause = "file_not_found"
// SkipBigOneNote identifies that a file was skipped because it
// was big OneNote file and we can only download OneNote files which
// are less that 2GB in size.
//nolint:lll
// https://support.microsoft.com/en-us/office/restrictions-and-limitations-in-onedrive-and-sharepoint-64883a5d-228e-48f5-b3d2-eb39e07630fa#onenotenotebooks
SkipBigOneNote skipCause = "big_one_note_file"
)
var _ print.Printable = &Skipped{}