diff --git a/src/internal/kopia/cleanup_backups.go b/src/internal/kopia/cleanup_backups.go
index b431b7a91..82ae04dc4 100644
--- a/src/internal/kopia/cleanup_backups.go
+++ b/src/internal/kopia/cleanup_backups.go
@@ -3,6 +3,7 @@ package kopia
 import (
     "context"
     "errors"
+    "time"

     "github.com/alcionai/clues"
     "github.com/kopia/kopia/repo/manifest"
@@ -16,10 +17,37 @@ import (
     "github.com/alcionai/corso/src/pkg/store"
 )

+// cleanupOrphanedData uses bs and mf to look up all models/snapshots for
+// backups and deletes items older than nowFunc() - gcBuffer (the cutoff) that
+// are not part of a "complete" backup, i.e. one with:
+// - a backup model
+// - an item data snapshot
+// - a details snapshot or details model
+//
+// We exclude all items younger than the cutoff to add some buffer so that even
+// if this is run concurrently with a backup it's not likely to delete models
+// just being created. For example, if there were no buffer period and this
+// were run when another corso instance has created an item data snapshot but
+// hasn't yet created the details snapshot or the backup model, it would result
+// in this instance of corso marking the newly created item data snapshot for
+// deletion because it appears orphaned.
+//
+// The buffer duration should be longer than the difference in creation times
+// between the first item data snapshot/details/backup model made during a
+// backup operation and the last.
+//
+// We don't have hard numbers on that time right now, but if the order of
+// persistence is (item data snapshot, details snapshot, backup model), it
+// should be faster than creating the snapshot itself and probably happens in
+// O(minutes) or O(hours) instead of O(days). Of course, that assumes a
+// non-adversarial setup where things such as machine hibernation, process
+// freezing (i.e. paused at the OS level), etc. don't occur.
 func cleanupOrphanedData(
     ctx context.Context,
     bs store.Storer,
     mf manifestFinder,
+    gcBuffer time.Duration,
+    nowFunc func() time.Time,
 ) error {
     // Get all snapshot manifests.
     snaps, err := mf.FindManifests(
@@ -43,27 +71,16 @@ func cleanupOrphanedData(
         dataSnaps = map[manifest.ID]struct{}{}
     )

-    // TODO(ashmrtn): Exclude all snapshots and details younger than X .
-    // Doing so adds some buffer so that even if this is run concurrently with a
-    // backup it's not likely to delete models just being created. For example,
-    // running this when another corso instance has created an item data snapshot
-    // but hasn't yet created the details snapshot or the backup model would
-    // result in this instance of corso marking the newly created item data
-    // snapshot for deletion because it appears orphaned.
-    //
-    // Excluding only snapshots and details models works for now since the backup
-    // model is the last thing persisted out of them. If we switch the order of
-    // persistence then this will need updated as well.
-    //
-    // The buffer duration should be longer than the time it would take to do
-    // details merging and backup model creation. We don't have hard numbers on
-    // that, but it should be faster than creating the snapshot itself and
-    // probably happens O(minutes) or O(hours) instead of O(days). Of course, that
-    // assumes a non-adversarial setup where things such as machine hiberation,
-    // process freezing (i.e. paused at the OS level), etc. don't occur.
+    cutoff := nowFunc().Add(-gcBuffer)

     // Sort all the snapshots as either details snapshots or item data snapshots.
     for _, snap := range snaps {
+        // Don't even try to see if this needs to be garbage collected because
+        // it's not old enough and may correspond to an in-progress operation.
+        if !cutoff.After(snap.ModTime) {
+            continue
+        }
+
         k, _ := makeTagKV(TagBackupCategory)
         if _, ok := snap.Labels[k]; ok {
             dataSnaps[snap.ID] = struct{}{}
@@ -82,6 +99,12 @@ func cleanupOrphanedData(
     }

     for _, d := range deetsModels {
+        // Don't even try to see if this needs to be garbage collected because
+        // it's not old enough and may correspond to an in-progress operation.
+        if !cutoff.After(d.ModTime) {
+            continue
+        }
+
         deets[d.ModelStoreID] = struct{}{}
     }

@@ -95,6 +118,12 @@ func cleanupOrphanedData(
     maps.Copy(toDelete, dataSnaps)

     for _, bup := range bups {
+        // Don't even try to see if this needs to be garbage collected because
+        // it's not old enough and may correspond to an in-progress operation.
+        if !cutoff.After(bup.ModTime) {
+            continue
+        }
+
         toDelete[manifest.ID(bup.ModelStoreID)] = struct{}{}

         bm := backup.Backup{}
@@ -110,12 +139,13 @@ func cleanupOrphanedData(
                 With("search_backup_id", bup.ID)
         }

-        // TODO(ashmrtn): This actually needs revised, see above TODO. Leaving it
-        // here for the moment to get the basic logic in.
+        // Probably safe to continue if the model wasn't found because that means
+        // that the possible item data and details for the backup are now
+        // orphaned. They'll be deleted since we won't remove them from the delete
+        // set.
         //
-        // Safe to continue if the model wasn't found because that means that the
-        // possible item data and details for the backup are now orphaned. They'll
-        // be deleted since we won't remove them from the delete set.
+        // The fact that we exclude all items younger than the cutoff should
+        // already filter out items from concurrent corso backup operations.
         //
         // This isn't expected to really pop up, but it's possible if this
         // function is run concurrently with either a backup delete or another
@@ -143,6 +173,11 @@ func cleanupOrphanedData(
         }
     }

+    logger.Ctx(ctx).Infow(
+        "garbage collecting orphaned items",
+        "num_items", len(toDelete),
+        "kopia_ids", maps.Keys(toDelete))
+
     // Use single atomic batch delete operation to cleanup to keep from making a
     // bunch of manifest content blobs.
     if err := bs.DeleteWithModelStoreIDs(ctx, maps.Keys(toDelete)...); err != nil {
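The age check above keeps anything whose ModTime is at or after the cutoff: an item only becomes a garbage collection candidate once it is strictly older than nowFunc() - gcBuffer. A minimal standalone sketch of that boundary behavior follows; the isOldEnough helper and the sample times are illustrative only and are not part of this change.

```go
package main

import (
	"fmt"
	"time"
)

// isOldEnough mirrors the cutoff comparison used in cleanupOrphanedData: an
// item is eligible for cleanup only when it is strictly older than
// now - gcBuffer.
func isOldEnough(modTime, now time.Time, gcBuffer time.Duration) bool {
	cutoff := now.Add(-gcBuffer)
	return cutoff.After(modTime)
}

func main() {
	var (
		gcBuffer = 24 * time.Hour
		created  = time.Date(2023, time.March, 1, 12, 0, 0, 0, time.UTC)
	)

	// Exactly gcBuffer old: still kept, matching the
	// "BarelyTooYoungForCleanup" test case below.
	fmt.Println(isOldEnough(created, created.Add(gcBuffer), gcBuffer)) // false

	// One second past the buffer: eligible for cleanup, matching the
	// "BarelyOldEnough" test case below.
	fmt.Println(isOldEnough(created, created.Add(gcBuffer+time.Second), gcBuffer)) // true
}
```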
diff --git a/src/internal/kopia/cleanup_backups_test.go b/src/internal/kopia/cleanup_backups_test.go
index 78bc6a164..895d9226e 100644
--- a/src/internal/kopia/cleanup_backups_test.go
+++ b/src/internal/kopia/cleanup_backups_test.go
@@ -4,6 +4,7 @@ import (
     "context"
     "fmt"
     "testing"
+    "time"

     "github.com/alcionai/clues"
     "github.com/kopia/kopia/repo/manifest"
@@ -221,6 +222,28 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
         },
     }

+    // Get a stable time so that we can make everything in the tests relative
+    // to it. Mostly just makes reasoning about and viewing times easier because
+    // the only differences will be the changes we make.
+    baseTime := time.Now()
+
+    manifestWithTime := func(
+        mt time.Time,
+        m *manifest.EntryMetadata,
+    ) *manifest.EntryMetadata {
+        res := *m
+        res.ModTime = mt
+
+        return &res
+    }
+
+    backupWithTime := func(mt time.Time, b *backup.Backup) *backup.Backup {
+        res := *b
+        res.ModTime = mt
+
+        return &res
+    }
+
     table := []struct {
         name            string
         snapshots       []*manifest.EntryMetadata
@@ -231,12 +254,15 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
         backups         []backupRes
         backupListErr   error
         deleteErr       error
+        time            time.Time
+        buffer          time.Duration
         expectDeleteIDs []manifest.ID
         expectErr       assert.ErrorAssertionFunc
     }{
         {
             name:      "EmptyRepo",
+            time:      baseTime,
             expectErr: assert.NoError,
         },
         {
@@ -253,6 +279,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
                 {bup: bupCurrent},
                 {bup: bupLegacy},
             },
+            time:      baseTime,
             expectErr: assert.NoError,
         },
         {
@@ -277,6 +304,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
                 snapNoDetails.ID,
                 deetsNoSnapshot.ID,
             },
+            time:      baseTime,
             expectErr: assert.NoError,
         },
         {
@@ -297,6 +325,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
                 manifest.ID(bupLegacy.ModelStoreID),
                 manifest.ID(deetsLegacy.ModelStoreID),
             },
+            time:      baseTime,
             expectErr: assert.NoError,
         },
         {
@@ -315,6 +344,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
                 snapCurrent.ID,
                 snapLegacy.ID,
             },
+            time:      baseTime,
             expectErr: assert.NoError,
         },
         {
@@ -334,6 +364,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
             backups: []backupRes{
                 {bup: bupCurrent},
             },
+            time:      baseTime,
             expectErr: assert.Error,
         },
         {
@@ -343,6 +374,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
                 deetsCurrent,
             },
             backupListErr: assert.AnError,
+            time:          baseTime,
             expectErr:     assert.Error,
         },
         {
@@ -378,6 +410,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
                 snapNoDetails.ID,
                 manifest.ID(bupNoDetails.ModelStoreID),
             },
+            time:      baseTime,
             expectErr: assert.NoError,
         },
         {
@@ -399,8 +432,77 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
                 },
                 {bup: bupNoDetails},
             },
+            time:      baseTime,
             expectErr: assert.Error,
         },
+        {
+            name: "DeleteError Fails",
+            snapshots: []*manifest.EntryMetadata{
+                snapCurrent,
+                deetsCurrent,
+                snapLegacy,
+                snapNoDetails,
+            },
+            detailsModels: []*model.BaseModel{
+                deetsLegacy,
+            },
+            backups: []backupRes{
+                {bup: bupCurrent},
+                {bup: bupLegacy},
+                {bup: bupNoDetails},
+            },
+            expectDeleteIDs: []manifest.ID{
+                snapNoDetails.ID,
+                manifest.ID(bupNoDetails.ModelStoreID),
+            },
+            deleteErr: assert.AnError,
+            time:      baseTime,
+            expectErr: assert.Error,
+        },
+        {
+            name: "MissingSnapshot BarelyTooYoungForCleanup Noops",
+            snapshots: []*manifest.EntryMetadata{
+                manifestWithTime(baseTime, deetsCurrent),
+            },
+            backups: []backupRes{
+                {bup: backupWithTime(baseTime, bupCurrent)},
+            },
+            time:      baseTime.Add(24 * time.Hour),
+            buffer:    24 * time.Hour,
+            expectErr: assert.NoError,
+        },
+        {
+            name: "MissingSnapshot BarelyOldEnough CausesCleanup",
+            snapshots: []*manifest.EntryMetadata{
+                manifestWithTime(baseTime, deetsCurrent),
+            },
+            backups: []backupRes{
+                {bup: backupWithTime(baseTime, bupCurrent)},
+            },
+            expectDeleteIDs: []manifest.ID{
+                deetsCurrent.ID,
+                manifest.ID(bupCurrent.ModelStoreID),
+            },
+            time:      baseTime.Add((24 * time.Hour) + time.Second),
+            buffer:    24 * time.Hour,
+            expectErr: assert.NoError,
+        },
+        {
+            name: "BackupGetErrorNotFound TooYoung Noops",
+            snapshots: []*manifest.EntryMetadata{
+                manifestWithTime(baseTime, snapCurrent),
+                manifestWithTime(baseTime, deetsCurrent),
+            },
+            backups: []backupRes{
+                {
+                    bup: backupWithTime(baseTime, bupCurrent),
+                    err: data.ErrNotFound,
+                },
+            },
+            time:      baseTime,
+            buffer:    24 * time.Hour,
+            expectErr: assert.NoError,
+        },
     }

     for _, test := range table {
@@ -426,7 +528,12 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
                 err: test.snapshotFetchErr,
             }

-            err := cleanupOrphanedData(ctx, mbs, mmf)
+            err := cleanupOrphanedData(
+                ctx,
+                mbs,
+                mmf,
+                test.buffer,
+                func() time.Time { return test.time })
             test.expectErr(t, err, clues.ToCore(err))
         })
     }
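Outside of the tests, the new gcBuffer and nowFunc parameters presumably get wired up with a fixed buffer and the real clock. The sketch below shows one way a caller in the same package could do that; runBackupCleanup and defaultGCBuffer are hypothetical names (only cleanupOrphanedData's signature comes from this diff), and the 24-hour value is an assumption chosen to satisfy the doc comment's requirement that the buffer exceed the gap between the first and last item persisted by a backup. Tests, as shown above, instead pass func() time.Time { return test.time } to pin the clock.

```go
// Hypothetical call-site sketch, not part of this diff. Only the
// cleanupOrphanedData signature is taken from the change above;
// runBackupCleanup and defaultGCBuffer are illustrative names.
package kopia

import (
	"context"
	"time"

	"github.com/alcionai/corso/src/pkg/store"
)

// defaultGCBuffer is an assumed value; the only requirement stated in the doc
// comment is that it be longer than the gap between the first and last
// snapshot/model persisted during a single backup operation.
const defaultGCBuffer = 24 * time.Hour

func runBackupCleanup(ctx context.Context, bs store.Storer, mf manifestFinder) error {
	// Production code passes the real clock; tests inject a fixed-time func
	// so cutoff comparisons are deterministic.
	return cleanupOrphanedData(ctx, bs, mf, defaultGCBuffer, time.Now)
}
```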