Exclude recently created models from garbage collection (#4066)

Exclude models that were created within the buffer period from garbage collection/orphaned-data checks, so that we don't accidentally delete models belonging to backups that are running concurrently with the garbage collection task.
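
For reference, a minimal sketch of the buffer check this PR adds (the 24-hour buffer and the standalone `main` are illustrative; in the diff below the comparison lives in `cleanupOrphanedData` and the buffer is passed in by the caller):

```go
// Illustrative only: mirrors the cutoff comparison added in cleanupOrphanedData.
// Anything modified after (now - buffer) is skipped so models created by an
// in-progress backup aren't treated as orphaned.
package main

import (
	"fmt"
	"time"
)

func main() {
	gcBuffer := 24 * time.Hour // assumed value; the real buffer is a parameter
	nowFunc := time.Now

	cutoff := nowFunc().Add(-gcBuffer)

	modTimes := map[string]time.Time{
		"old snapshot":       nowFunc().Add(-48 * time.Hour),
		"in-progress backup": nowFunc().Add(-time.Minute),
	}

	for name, mt := range modTimes {
		if !cutoff.After(mt) {
			// Too young: may belong to a backup that's still running.
			fmt.Println("skip:", name)
			continue
		}

		fmt.Println("check for orphaned data:", name)
	}
}
```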

---

#### Does this PR need a docs update or release note?

- [ ] Yes, it's included
- [x] 🕐 Yes, but in a later PR
- [ ] No

#### Type of change

- [x] 🌻 Feature
- [ ] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

* #3217

#### Test Plan

- [ ] 💪 Manual
- [x] Unit test
- [ ] 💚 E2E
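
The unit tests exercise the cutoff through an injected clock: `cleanupOrphanedData` now takes a `nowFunc func() time.Time` plus a buffer, and each table case pins both so the boundary math is deterministic. A rough sketch of that pattern, with a hypothetical `cleanupEligible` helper standing in for the real comparison:

```go
// Sketch of the clock-injection pattern the new unit tests rely on.
// cleanupEligible is a hypothetical helper mirroring the cutoff check from the
// diff; it is not part of the Corso codebase.
package main

import (
	"fmt"
	"time"
)

// cleanupEligible reports whether an item is old enough to be considered for
// cleanup, i.e. its ModTime falls before now - buffer.
func cleanupEligible(modTime time.Time, buffer time.Duration, nowFunc func() time.Time) bool {
	cutoff := nowFunc().Add(-buffer)
	return cutoff.After(modTime)
}

func main() {
	baseTime := time.Now()
	buffer := 24 * time.Hour

	// Barely too young: now - buffer == modTime, so the item is left alone.
	now := baseTime.Add(24 * time.Hour)
	fmt.Println(cleanupEligible(baseTime, buffer, func() time.Time { return now })) // false

	// Barely old enough: one second past the buffer, so the item is eligible.
	now = baseTime.Add(24*time.Hour + time.Second)
	fmt.Println(cleanupEligible(baseTime, buffer, func() time.Time { return now })) // true
}
```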
ashmrtn committed 2023-08-21 17:39:24 -07:00 (committed by GitHub)
commit 11253bf816, parent 99edf7d5b0
2 changed files with 166 additions and 24 deletions


@ -3,6 +3,7 @@ package kopia
import (
"context"
"errors"
"time"
"github.com/alcionai/clues"
"github.com/kopia/kopia/repo/manifest"
@ -16,10 +17,37 @@ import (
"github.com/alcionai/corso/src/pkg/store" "github.com/alcionai/corso/src/pkg/store"
) )
// cleanupOrphanedData uses bs and mf to look up all models/snapshots for backups
// and deletes items that are older than nowFunc() - gcBuffer (cutoff) that are
// not "complete" backups with:
// - a backup model
// - an item data snapshot
// - a details snapshot or details model
//
// We exclude all items younger than the cutoff to add some buffer so that even
// if this is run concurrently with a backup it's not likely to delete models
// just being created. For example, if there was no buffer period and this is
// run when another corso instance has created an item data snapshot but hasn't
// yet created the details snapshot or the backup model it would result in this
// instance of corso marking the newly created item data snapshot for deletion
// because it appears orphaned.
//
// The buffer duration should be longer than the difference in creation times
// between the first item data snapshot/details/backup model made during a
// backup operation and the last.
//
// We don't have hard numbers on the time right now, but if the order of
// persistence is (item data snapshot, details snapshot, backup model) it should
// be faster than creating the snapshot itself and probably happens O(minutes)
// or O(hours) instead of O(days). Of course, that assumes a non-adversarial
// setup where things such as machine hibernation, process freezing (i.e. paused
// at the OS level), etc. don't occur.
func cleanupOrphanedData(
ctx context.Context,
bs store.Storer,
mf manifestFinder,
gcBuffer time.Duration,
nowFunc func() time.Time,
) error {
// Get all snapshot manifests.
snaps, err := mf.FindManifests(
@ -43,27 +71,16 @@ func cleanupOrphanedData(
dataSnaps = map[manifest.ID]struct{}{}
)
-// TODO(ashmrtn): Exclude all snapshots and details younger than X <duration>.
-// Doing so adds some buffer so that even if this is run concurrently with a
-// backup it's not likely to delete models just being created. For example,
-// running this when another corso instance has created an item data snapshot
-// but hasn't yet created the details snapshot or the backup model would
-// result in this instance of corso marking the newly created item data
-// snapshot for deletion because it appears orphaned.
-//
-// Excluding only snapshots and details models works for now since the backup
-// model is the last thing persisted out of them. If we switch the order of
-// persistence then this will need updated as well.
-//
-// The buffer duration should be longer than the time it would take to do
-// details merging and backup model creation. We don't have hard numbers on
-// that, but it should be faster than creating the snapshot itself and
-// probably happens O(minutes) or O(hours) instead of O(days). Of course, that
-// assumes a non-adversarial setup where things such as machine hiberation,
-// process freezing (i.e. paused at the OS level), etc. don't occur.
cutoff := nowFunc().Add(-gcBuffer)
// Sort all the snapshots as either details snapshots or item data snapshots.
for _, snap := range snaps {
// Don't even try to see if this needs garbage collected because it's not
// old enough and may correspond to an in-progress operation.
if !cutoff.After(snap.ModTime) {
continue
}
k, _ := makeTagKV(TagBackupCategory)
if _, ok := snap.Labels[k]; ok {
dataSnaps[snap.ID] = struct{}{}
@ -82,6 +99,12 @@ func cleanupOrphanedData(
}
for _, d := range deetsModels {
// Don't even try to see if this needs garbage collected because it's not
// old enough and may correspond to an in-progress operation.
if !cutoff.After(d.ModTime) {
continue
}
deets[d.ModelStoreID] = struct{}{}
}
@ -95,6 +118,12 @@ func cleanupOrphanedData(
maps.Copy(toDelete, dataSnaps)
for _, bup := range bups {
// Don't even try to see if this needs garbage collected because it's not
// old enough and may correspond to an in-progress operation.
if !cutoff.After(bup.ModTime) {
continue
}
toDelete[manifest.ID(bup.ModelStoreID)] = struct{}{}
bm := backup.Backup{}
@ -110,12 +139,13 @@ func cleanupOrphanedData(
With("search_backup_id", bup.ID) With("search_backup_id", bup.ID)
} }
// TODO(ashmrtn): This actually needs revised, see above TODO. Leaving it // Probably safe to continue if the model wasn't found because that means
// here for the moment to get the basic logic in. // that the possible item data and details for the backup are now
// orphaned. They'll be deleted since we won't remove them from the delete
// set.
// //
// Safe to continue if the model wasn't found because that means that the // The fact that we exclude all items younger than the cutoff should
// possible item data and details for the backup are now orphaned. They'll // already exclude items that are from concurrent corso backup operations.
// be deleted since we won't remove them from the delete set.
// //
// This isn't expected to really pop up, but it's possible if this // This isn't expected to really pop up, but it's possible if this
// function is run concurrently with either a backup delete or another // function is run concurrently with either a backup delete or another
@ -143,6 +173,11 @@ func cleanupOrphanedData(
}
}
logger.Ctx(ctx).Infow(
"garbage collecting orphaned items",
"num_items", len(toDelete),
"kopia_ids", maps.Keys(toDelete))
// Use single atomic batch delete operation to cleanup to keep from making a
// bunch of manifest content blobs.
if err := bs.DeleteWithModelStoreIDs(ctx, maps.Keys(toDelete)...); err != nil {


@ -4,6 +4,7 @@ import (
"context" "context"
"fmt" "fmt"
"testing" "testing"
"time"
"github.com/alcionai/clues" "github.com/alcionai/clues"
"github.com/kopia/kopia/repo/manifest" "github.com/kopia/kopia/repo/manifest"
@ -221,6 +222,28 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
},
}
// Get some stable time so that we can do everything relative to this in the
// tests. Mostly just makes reasoning/viewing times easier because the only
// differences will be the changes we make.
baseTime := time.Now()
manifestWithTime := func(
mt time.Time,
m *manifest.EntryMetadata,
) *manifest.EntryMetadata {
res := *m
res.ModTime = mt
return &res
}
backupWithTime := func(mt time.Time, b *backup.Backup) *backup.Backup {
res := *b
res.ModTime = mt
return &res
}
table := []struct {
name string
snapshots []*manifest.EntryMetadata
@ -231,12 +254,15 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
backups []backupRes
backupListErr error
deleteErr error
time time.Time
buffer time.Duration
expectDeleteIDs []manifest.ID
expectErr assert.ErrorAssertionFunc
}{
{
name: "EmptyRepo",
time: baseTime,
expectErr: assert.NoError,
},
{
@ -253,6 +279,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
{bup: bupCurrent},
{bup: bupLegacy},
},
time: baseTime,
expectErr: assert.NoError,
},
{
@ -277,6 +304,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
snapNoDetails.ID,
deetsNoSnapshot.ID,
},
time: baseTime,
expectErr: assert.NoError,
},
{
@ -297,6 +325,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
manifest.ID(bupLegacy.ModelStoreID),
manifest.ID(deetsLegacy.ModelStoreID),
},
time: baseTime,
expectErr: assert.NoError,
},
{
@ -315,6 +344,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
snapCurrent.ID,
snapLegacy.ID,
},
time: baseTime,
expectErr: assert.NoError,
},
{
@ -334,6 +364,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
backups: []backupRes{
{bup: bupCurrent},
},
time: baseTime,
expectErr: assert.Error,
},
{
@ -343,6 +374,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
deetsCurrent,
},
backupListErr: assert.AnError,
time: baseTime,
expectErr: assert.Error,
},
{
@ -378,6 +410,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
snapNoDetails.ID,
manifest.ID(bupNoDetails.ModelStoreID),
},
time: baseTime,
expectErr: assert.NoError,
},
{
@ -399,8 +432,77 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
},
{bup: bupNoDetails},
},
time: baseTime,
expectErr: assert.Error,
},
{
name: "DeleteError Fails",
snapshots: []*manifest.EntryMetadata{
snapCurrent,
deetsCurrent,
snapLegacy,
snapNoDetails,
},
detailsModels: []*model.BaseModel{
deetsLegacy,
},
backups: []backupRes{
{bup: bupCurrent},
{bup: bupLegacy},
{bup: bupNoDetails},
},
expectDeleteIDs: []manifest.ID{
snapNoDetails.ID,
manifest.ID(bupNoDetails.ModelStoreID),
},
deleteErr: assert.AnError,
time: baseTime,
expectErr: assert.Error,
},
{
name: "MissingSnapshot BarelyTooYoungForCleanup Noops",
snapshots: []*manifest.EntryMetadata{
manifestWithTime(baseTime, deetsCurrent),
},
backups: []backupRes{
{bup: backupWithTime(baseTime, bupCurrent)},
},
time: baseTime.Add(24 * time.Hour),
buffer: 24 * time.Hour,
expectErr: assert.NoError,
},
{
name: "MissingSnapshot BarelyOldEnough CausesCleanup",
snapshots: []*manifest.EntryMetadata{
manifestWithTime(baseTime, deetsCurrent),
},
backups: []backupRes{
{bup: backupWithTime(baseTime, bupCurrent)},
},
expectDeleteIDs: []manifest.ID{
deetsCurrent.ID,
manifest.ID(bupCurrent.ModelStoreID),
},
time: baseTime.Add((24 * time.Hour) + time.Second),
buffer: 24 * time.Hour,
expectErr: assert.NoError,
},
{
name: "BackupGetErrorNotFound TooYoung Noops",
snapshots: []*manifest.EntryMetadata{
manifestWithTime(baseTime, snapCurrent),
manifestWithTime(baseTime, deetsCurrent),
},
backups: []backupRes{
{
bup: backupWithTime(baseTime, bupCurrent),
err: data.ErrNotFound,
},
},
time: baseTime,
buffer: 24 * time.Hour,
expectErr: assert.NoError,
},
} }
for _, test := range table {
@ -426,7 +528,12 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
err: test.snapshotFetchErr,
}
-err := cleanupOrphanedData(ctx, mbs, mmf)
err := cleanupOrphanedData(
ctx,
mbs,
mmf,
test.buffer,
func() time.Time { return test.time })
test.expectErr(t, err, clues.ToCore(err))
})
}