Exclude recently created models from garbage collection (#4066)

Exclude models that have been created within the
buffer period from garbage collection/orphaned
checks so that we don't accidentally delete
models for backups that are running concurrently
with the garbage collection task

---

#### Does this PR need a docs update or release note?

- [ ]  Yes, it's included
- [x] 🕐 Yes, but in a later PR
- [ ]  No

#### Type of change

- [x] 🌻 Feature
- [ ] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

* #3217

#### Test Plan

- [ ] 💪 Manual
- [x]  Unit test
- [ ] 💚 E2E
This commit is contained in:
ashmrtn 2023-08-21 17:39:24 -07:00 committed by GitHub
parent 99edf7d5b0
commit 11253bf816
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 166 additions and 24 deletions

View File

@ -3,6 +3,7 @@ package kopia
import (
"context"
"errors"
"time"
"github.com/alcionai/clues"
"github.com/kopia/kopia/repo/manifest"
@ -16,10 +17,37 @@ import (
"github.com/alcionai/corso/src/pkg/store"
)
// cleanupOrphanedData uses bs and mf to look up all models/snapshots for backups
// and deletes items that are older than nowFunc() - gcBuffer (cutoff) that are
// not "complete" backups with:
// - a backup model
// - an item data snapshot
// - a details snapshot or details model
//
// We exclude all items younger than the cutoff to add some buffer so that even
// if this is run concurrently with a backup it's not likely to delete models
// just being created. For example, if there was no buffer period and this is
// run when another corso instance has created an item data snapshot but hasn't
// yet created the details snapshot or the backup model it would result in this
// instance of corso marking the newly created item data snapshot for deletion
// because it appears orphaned.
//
// The buffer duration should be longer than the difference in creation times
// between the first item data snapshot/details/backup model made during a
// backup operation and the last.
//
// We don't have hard numbers on the time right now, but if the order of
// persistence is (item data snapshot, details snapshot, backup model) it should
// be faster than creating the snapshot itself and probably happens O(minutes)
// or O(hours) instead of O(days). Of course, that assumes a non-adversarial
// setup where things such as machine hibernation, process freezing (i.e. paused
// at the OS level), etc. don't occur.
func cleanupOrphanedData(
ctx context.Context,
bs store.Storer,
mf manifestFinder,
gcBuffer time.Duration,
nowFunc func() time.Time,
) error {
// Get all snapshot manifests.
snaps, err := mf.FindManifests(
@ -43,27 +71,16 @@ func cleanupOrphanedData(
dataSnaps = map[manifest.ID]struct{}{}
)
// TODO(ashmrtn): Exclude all snapshots and details younger than X <duration>.
// Doing so adds some buffer so that even if this is run concurrently with a
// backup it's not likely to delete models just being created. For example,
// running this when another corso instance has created an item data snapshot
// but hasn't yet created the details snapshot or the backup model would
// result in this instance of corso marking the newly created item data
// snapshot for deletion because it appears orphaned.
//
// Excluding only snapshots and details models works for now since the backup
// model is the last thing persisted out of them. If we switch the order of
// persistence then this will need updated as well.
//
// The buffer duration should be longer than the time it would take to do
// details merging and backup model creation. We don't have hard numbers on
// that, but it should be faster than creating the snapshot itself and
// probably happens O(minutes) or O(hours) instead of O(days). Of course, that
// assumes a non-adversarial setup where things such as machine hibernation,
// process freezing (i.e. paused at the OS level), etc. don't occur.
cutoff := nowFunc().Add(-gcBuffer)
// Sort all the snapshots as either details snapshots or item data snapshots.
for _, snap := range snaps {
// Don't even try to see if this needs garbage collected because it's not
// old enough and may correspond to an in-progress operation.
if !cutoff.After(snap.ModTime) {
continue
}
k, _ := makeTagKV(TagBackupCategory)
if _, ok := snap.Labels[k]; ok {
dataSnaps[snap.ID] = struct{}{}
@ -82,6 +99,12 @@ func cleanupOrphanedData(
}
for _, d := range deetsModels {
// Don't even try to see if this needs garbage collected because it's not
// old enough and may correspond to an in-progress operation.
if !cutoff.After(d.ModTime) {
continue
}
deets[d.ModelStoreID] = struct{}{}
}
@ -95,6 +118,12 @@ func cleanupOrphanedData(
maps.Copy(toDelete, dataSnaps)
for _, bup := range bups {
// Don't even try to see if this needs garbage collected because it's not
// old enough and may correspond to an in-progress operation.
if !cutoff.After(bup.ModTime) {
continue
}
toDelete[manifest.ID(bup.ModelStoreID)] = struct{}{}
bm := backup.Backup{}
@ -110,12 +139,13 @@ func cleanupOrphanedData(
With("search_backup_id", bup.ID)
}
// TODO(ashmrtn): This actually needs to be revised, see above TODO. Leaving it
// here for the moment to get the basic logic in.
// Probably safe to continue if the model wasn't found because that means
// that the possible item data and details for the backup are now
// orphaned. They'll be deleted since we won't remove them from the delete
// set.
//
// Safe to continue if the model wasn't found because that means that the
// possible item data and details for the backup are now orphaned. They'll
// be deleted since we won't remove them from the delete set.
// The fact that we exclude all items younger than the cutoff should
// already exclude items that are from concurrent corso backup operations.
//
// This isn't expected to really pop up, but it's possible if this
// function is run concurrently with either a backup delete or another
@ -143,6 +173,11 @@ func cleanupOrphanedData(
}
}
logger.Ctx(ctx).Infow(
"garbage collecting orphaned items",
"num_items", len(toDelete),
"kopia_ids", maps.Keys(toDelete))
// Use single atomic batch delete operation to cleanup to keep from making a
// bunch of manifest content blobs.
if err := bs.DeleteWithModelStoreIDs(ctx, maps.Keys(toDelete)...); err != nil {

View File

@ -4,6 +4,7 @@ import (
"context"
"fmt"
"testing"
"time"
"github.com/alcionai/clues"
"github.com/kopia/kopia/repo/manifest"
@ -221,6 +222,28 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
},
}
// Get some stable time so that we can do everything relative to this in the
// tests. Mostly just makes reasoning/viewing times easier because the only
// differences will be the changes we make.
baseTime := time.Now()
manifestWithTime := func(
mt time.Time,
m *manifest.EntryMetadata,
) *manifest.EntryMetadata {
res := *m
res.ModTime = mt
return &res
}
backupWithTime := func(mt time.Time, b *backup.Backup) *backup.Backup {
res := *b
res.ModTime = mt
return &res
}
table := []struct {
name string
snapshots []*manifest.EntryMetadata
@ -231,12 +254,15 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
backups []backupRes
backupListErr error
deleteErr error
time time.Time
buffer time.Duration
expectDeleteIDs []manifest.ID
expectErr assert.ErrorAssertionFunc
}{
{
name: "EmptyRepo",
time: baseTime,
expectErr: assert.NoError,
},
{
@ -253,6 +279,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
{bup: bupCurrent},
{bup: bupLegacy},
},
time: baseTime,
expectErr: assert.NoError,
},
{
@ -277,6 +304,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
snapNoDetails.ID,
deetsNoSnapshot.ID,
},
time: baseTime,
expectErr: assert.NoError,
},
{
@ -297,6 +325,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
manifest.ID(bupLegacy.ModelStoreID),
manifest.ID(deetsLegacy.ModelStoreID),
},
time: baseTime,
expectErr: assert.NoError,
},
{
@ -315,6 +344,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
snapCurrent.ID,
snapLegacy.ID,
},
time: baseTime,
expectErr: assert.NoError,
},
{
@ -334,6 +364,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
backups: []backupRes{
{bup: bupCurrent},
},
time: baseTime,
expectErr: assert.Error,
},
{
@ -343,6 +374,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
deetsCurrent,
},
backupListErr: assert.AnError,
time: baseTime,
expectErr: assert.Error,
},
{
@ -378,6 +410,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
snapNoDetails.ID,
manifest.ID(bupNoDetails.ModelStoreID),
},
time: baseTime,
expectErr: assert.NoError,
},
{
@ -399,8 +432,77 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
},
{bup: bupNoDetails},
},
time: baseTime,
expectErr: assert.Error,
},
{
name: "DeleteError Fails",
snapshots: []*manifest.EntryMetadata{
snapCurrent,
deetsCurrent,
snapLegacy,
snapNoDetails,
},
detailsModels: []*model.BaseModel{
deetsLegacy,
},
backups: []backupRes{
{bup: bupCurrent},
{bup: bupLegacy},
{bup: bupNoDetails},
},
expectDeleteIDs: []manifest.ID{
snapNoDetails.ID,
manifest.ID(bupNoDetails.ModelStoreID),
},
deleteErr: assert.AnError,
time: baseTime,
expectErr: assert.Error,
},
{
name: "MissingSnapshot BarelyTooYoungForCleanup Noops",
snapshots: []*manifest.EntryMetadata{
manifestWithTime(baseTime, deetsCurrent),
},
backups: []backupRes{
{bup: backupWithTime(baseTime, bupCurrent)},
},
time: baseTime.Add(24 * time.Hour),
buffer: 24 * time.Hour,
expectErr: assert.NoError,
},
{
name: "MissingSnapshot BarelyOldEnough CausesCleanup",
snapshots: []*manifest.EntryMetadata{
manifestWithTime(baseTime, deetsCurrent),
},
backups: []backupRes{
{bup: backupWithTime(baseTime, bupCurrent)},
},
expectDeleteIDs: []manifest.ID{
deetsCurrent.ID,
manifest.ID(bupCurrent.ModelStoreID),
},
time: baseTime.Add((24 * time.Hour) + time.Second),
buffer: 24 * time.Hour,
expectErr: assert.NoError,
},
{
name: "BackupGetErrorNotFound TooYoung Noops",
snapshots: []*manifest.EntryMetadata{
manifestWithTime(baseTime, snapCurrent),
manifestWithTime(baseTime, deetsCurrent),
},
backups: []backupRes{
{
bup: backupWithTime(baseTime, bupCurrent),
err: data.ErrNotFound,
},
},
time: baseTime,
buffer: 24 * time.Hour,
expectErr: assert.NoError,
},
}
for _, test := range table {
@ -426,7 +528,12 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
err: test.snapshotFetchErr,
}
err := cleanupOrphanedData(ctx, mbs, mmf)
err := cleanupOrphanedData(
ctx,
mbs,
mmf,
test.buffer,
func() time.Time { return test.time })
test.expectErr(t, err, clues.ToCore(err))
})
}