Exclude recently created models from garbage collection (#4066)
Exclude models that have been created within the buffer period from garbage collection/orphaned checks so that we don't accidentally delete models for backups that are running concurrently with the garbage collection task --- #### Does this PR need a docs update or release note? - [ ] ✅ Yes, it's included - [x] 🕐 Yes, but in a later PR - [ ] ⛔ No #### Type of change - [x] 🌻 Feature - [ ] 🐛 Bugfix - [ ] 🗺️ Documentation - [ ] 🤖 Supportability/Tests - [ ] 💻 CI/Deployment - [ ] 🧹 Tech Debt/Cleanup #### Issue(s) * #3217 #### Test Plan - [ ] 💪 Manual - [x] ⚡ Unit test - [ ] 💚 E2E
This commit is contained in:
parent
99edf7d5b0
commit
11253bf816
@ -3,6 +3,7 @@ package kopia
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"time"
|
||||
|
||||
"github.com/alcionai/clues"
|
||||
"github.com/kopia/kopia/repo/manifest"
|
||||
@ -16,10 +17,37 @@ import (
|
||||
"github.com/alcionai/corso/src/pkg/store"
|
||||
)
|
||||
|
||||
// cleanupOrphanedData uses bs and mf to lookup all models/snapshots for backups
|
||||
// and deletes items that are older than nowFunc() - gcBuffer (cutoff) that are
|
||||
// not "complete" backups with:
|
||||
// - a backup model
|
||||
// - an item data snapshot
|
||||
// - a details snapshot or details model
|
||||
//
|
||||
// We exclude all items younger than the cutoff to add some buffer so that even
|
||||
// if this is run concurrently with a backup it's not likely to delete models
|
||||
// just being created. For example, if there was no buffer period and this is
|
||||
// run when another corso instance has created an item data snapshot but hasn't
|
||||
// yet created the details snapshot or the backup model it would result in this
|
||||
// instance of corso marking the newly created item data snapshot for deletion
|
||||
// because it appears orphaned.
|
||||
//
|
||||
// The buffer duration should be longer than the difference in creation times
|
||||
// between the first item data snapshot/details/backup model made during a
|
||||
// backup operation and the last.
|
||||
//
|
||||
// We don't have hard numbers on the time right now, but if the order of
|
||||
// persistence is (item data snapshot, details snapshot, backup model) it should
|
||||
// be faster than creating the snapshot itself and probably happens O(minutes)
|
||||
// or O(hours) instead of O(days). Of course, that assumes a non-adversarial
|
||||
// setup where things such as machine hiberation, process freezing (i.e. paused
|
||||
// at the OS level), etc. don't occur.
|
||||
func cleanupOrphanedData(
|
||||
ctx context.Context,
|
||||
bs store.Storer,
|
||||
mf manifestFinder,
|
||||
gcBuffer time.Duration,
|
||||
nowFunc func() time.Time,
|
||||
) error {
|
||||
// Get all snapshot manifests.
|
||||
snaps, err := mf.FindManifests(
|
||||
@ -43,27 +71,16 @@ func cleanupOrphanedData(
|
||||
dataSnaps = map[manifest.ID]struct{}{}
|
||||
)
|
||||
|
||||
// TODO(ashmrtn): Exclude all snapshots and details younger than X <duration>.
|
||||
// Doing so adds some buffer so that even if this is run concurrently with a
|
||||
// backup it's not likely to delete models just being created. For example,
|
||||
// running this when another corso instance has created an item data snapshot
|
||||
// but hasn't yet created the details snapshot or the backup model would
|
||||
// result in this instance of corso marking the newly created item data
|
||||
// snapshot for deletion because it appears orphaned.
|
||||
//
|
||||
// Excluding only snapshots and details models works for now since the backup
|
||||
// model is the last thing persisted out of them. If we switch the order of
|
||||
// persistence then this will need updated as well.
|
||||
//
|
||||
// The buffer duration should be longer than the time it would take to do
|
||||
// details merging and backup model creation. We don't have hard numbers on
|
||||
// that, but it should be faster than creating the snapshot itself and
|
||||
// probably happens O(minutes) or O(hours) instead of O(days). Of course, that
|
||||
// assumes a non-adversarial setup where things such as machine hiberation,
|
||||
// process freezing (i.e. paused at the OS level), etc. don't occur.
|
||||
cutoff := nowFunc().Add(-gcBuffer)
|
||||
|
||||
// Sort all the snapshots as either details snapshots or item data snapshots.
|
||||
for _, snap := range snaps {
|
||||
// Don't even try to see if this needs garbage collected because it's not
|
||||
// old enough and may correspond to an in-progress operation.
|
||||
if !cutoff.After(snap.ModTime) {
|
||||
continue
|
||||
}
|
||||
|
||||
k, _ := makeTagKV(TagBackupCategory)
|
||||
if _, ok := snap.Labels[k]; ok {
|
||||
dataSnaps[snap.ID] = struct{}{}
|
||||
@ -82,6 +99,12 @@ func cleanupOrphanedData(
|
||||
}
|
||||
|
||||
for _, d := range deetsModels {
|
||||
// Don't even try to see if this needs garbage collected because it's not
|
||||
// old enough and may correspond to an in-progress operation.
|
||||
if !cutoff.After(d.ModTime) {
|
||||
continue
|
||||
}
|
||||
|
||||
deets[d.ModelStoreID] = struct{}{}
|
||||
}
|
||||
|
||||
@ -95,6 +118,12 @@ func cleanupOrphanedData(
|
||||
maps.Copy(toDelete, dataSnaps)
|
||||
|
||||
for _, bup := range bups {
|
||||
// Don't even try to see if this needs garbage collected because it's not
|
||||
// old enough and may correspond to an in-progress operation.
|
||||
if !cutoff.After(bup.ModTime) {
|
||||
continue
|
||||
}
|
||||
|
||||
toDelete[manifest.ID(bup.ModelStoreID)] = struct{}{}
|
||||
|
||||
bm := backup.Backup{}
|
||||
@ -110,12 +139,13 @@ func cleanupOrphanedData(
|
||||
With("search_backup_id", bup.ID)
|
||||
}
|
||||
|
||||
// TODO(ashmrtn): This actually needs revised, see above TODO. Leaving it
|
||||
// here for the moment to get the basic logic in.
|
||||
// Probably safe to continue if the model wasn't found because that means
|
||||
// that the possible item data and details for the backup are now
|
||||
// orphaned. They'll be deleted since we won't remove them from the delete
|
||||
// set.
|
||||
//
|
||||
// Safe to continue if the model wasn't found because that means that the
|
||||
// possible item data and details for the backup are now orphaned. They'll
|
||||
// be deleted since we won't remove them from the delete set.
|
||||
// The fact that we exclude all items younger than the cutoff should
|
||||
// already exclude items that are from concurrent corso backup operations.
|
||||
//
|
||||
// This isn't expected to really pop up, but it's possible if this
|
||||
// function is run concurrently with either a backup delete or another
|
||||
@ -143,6 +173,11 @@ func cleanupOrphanedData(
|
||||
}
|
||||
}
|
||||
|
||||
logger.Ctx(ctx).Infow(
|
||||
"garbage collecting orphaned items",
|
||||
"num_items", len(toDelete),
|
||||
"kopia_ids", maps.Keys(toDelete))
|
||||
|
||||
// Use single atomic batch delete operation to cleanup to keep from making a
|
||||
// bunch of manifest content blobs.
|
||||
if err := bs.DeleteWithModelStoreIDs(ctx, maps.Keys(toDelete)...); err != nil {
|
||||
|
||||
@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/alcionai/clues"
|
||||
"github.com/kopia/kopia/repo/manifest"
|
||||
@ -221,6 +222,28 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
},
|
||||
}
|
||||
|
||||
// Get some stable time so that we can do everything relative to this in the
|
||||
// tests. Mostly just makes reasoning/viewing times easier because the only
|
||||
// differences will be the changes we make.
|
||||
baseTime := time.Now()
|
||||
|
||||
manifestWithTime := func(
|
||||
mt time.Time,
|
||||
m *manifest.EntryMetadata,
|
||||
) *manifest.EntryMetadata {
|
||||
res := *m
|
||||
res.ModTime = mt
|
||||
|
||||
return &res
|
||||
}
|
||||
|
||||
backupWithTime := func(mt time.Time, b *backup.Backup) *backup.Backup {
|
||||
res := *b
|
||||
res.ModTime = mt
|
||||
|
||||
return &res
|
||||
}
|
||||
|
||||
table := []struct {
|
||||
name string
|
||||
snapshots []*manifest.EntryMetadata
|
||||
@ -231,12 +254,15 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
backups []backupRes
|
||||
backupListErr error
|
||||
deleteErr error
|
||||
time time.Time
|
||||
buffer time.Duration
|
||||
|
||||
expectDeleteIDs []manifest.ID
|
||||
expectErr assert.ErrorAssertionFunc
|
||||
}{
|
||||
{
|
||||
name: "EmptyRepo",
|
||||
time: baseTime,
|
||||
expectErr: assert.NoError,
|
||||
},
|
||||
{
|
||||
@ -253,6 +279,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
{bup: bupCurrent},
|
||||
{bup: bupLegacy},
|
||||
},
|
||||
time: baseTime,
|
||||
expectErr: assert.NoError,
|
||||
},
|
||||
{
|
||||
@ -277,6 +304,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
snapNoDetails.ID,
|
||||
deetsNoSnapshot.ID,
|
||||
},
|
||||
time: baseTime,
|
||||
expectErr: assert.NoError,
|
||||
},
|
||||
{
|
||||
@ -297,6 +325,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
manifest.ID(bupLegacy.ModelStoreID),
|
||||
manifest.ID(deetsLegacy.ModelStoreID),
|
||||
},
|
||||
time: baseTime,
|
||||
expectErr: assert.NoError,
|
||||
},
|
||||
{
|
||||
@ -315,6 +344,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
snapCurrent.ID,
|
||||
snapLegacy.ID,
|
||||
},
|
||||
time: baseTime,
|
||||
expectErr: assert.NoError,
|
||||
},
|
||||
{
|
||||
@ -334,6 +364,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
backups: []backupRes{
|
||||
{bup: bupCurrent},
|
||||
},
|
||||
time: baseTime,
|
||||
expectErr: assert.Error,
|
||||
},
|
||||
{
|
||||
@ -343,6 +374,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
deetsCurrent,
|
||||
},
|
||||
backupListErr: assert.AnError,
|
||||
time: baseTime,
|
||||
expectErr: assert.Error,
|
||||
},
|
||||
{
|
||||
@ -378,6 +410,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
snapNoDetails.ID,
|
||||
manifest.ID(bupNoDetails.ModelStoreID),
|
||||
},
|
||||
time: baseTime,
|
||||
expectErr: assert.NoError,
|
||||
},
|
||||
{
|
||||
@ -399,8 +432,77 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
},
|
||||
{bup: bupNoDetails},
|
||||
},
|
||||
time: baseTime,
|
||||
expectErr: assert.Error,
|
||||
},
|
||||
{
|
||||
name: "DeleteError Fails",
|
||||
snapshots: []*manifest.EntryMetadata{
|
||||
snapCurrent,
|
||||
deetsCurrent,
|
||||
snapLegacy,
|
||||
snapNoDetails,
|
||||
},
|
||||
detailsModels: []*model.BaseModel{
|
||||
deetsLegacy,
|
||||
},
|
||||
backups: []backupRes{
|
||||
{bup: bupCurrent},
|
||||
{bup: bupLegacy},
|
||||
{bup: bupNoDetails},
|
||||
},
|
||||
expectDeleteIDs: []manifest.ID{
|
||||
snapNoDetails.ID,
|
||||
manifest.ID(bupNoDetails.ModelStoreID),
|
||||
},
|
||||
deleteErr: assert.AnError,
|
||||
time: baseTime,
|
||||
expectErr: assert.Error,
|
||||
},
|
||||
{
|
||||
name: "MissingSnapshot BarelyTooYoungForCleanup Noops",
|
||||
snapshots: []*manifest.EntryMetadata{
|
||||
manifestWithTime(baseTime, deetsCurrent),
|
||||
},
|
||||
backups: []backupRes{
|
||||
{bup: backupWithTime(baseTime, bupCurrent)},
|
||||
},
|
||||
time: baseTime.Add(24 * time.Hour),
|
||||
buffer: 24 * time.Hour,
|
||||
expectErr: assert.NoError,
|
||||
},
|
||||
{
|
||||
name: "MissingSnapshot BarelyOldEnough CausesCleanup",
|
||||
snapshots: []*manifest.EntryMetadata{
|
||||
manifestWithTime(baseTime, deetsCurrent),
|
||||
},
|
||||
backups: []backupRes{
|
||||
{bup: backupWithTime(baseTime, bupCurrent)},
|
||||
},
|
||||
expectDeleteIDs: []manifest.ID{
|
||||
deetsCurrent.ID,
|
||||
manifest.ID(bupCurrent.ModelStoreID),
|
||||
},
|
||||
time: baseTime.Add((24 * time.Hour) + time.Second),
|
||||
buffer: 24 * time.Hour,
|
||||
expectErr: assert.NoError,
|
||||
},
|
||||
{
|
||||
name: "BackupGetErrorNotFound TooYoung Noops",
|
||||
snapshots: []*manifest.EntryMetadata{
|
||||
manifestWithTime(baseTime, snapCurrent),
|
||||
manifestWithTime(baseTime, deetsCurrent),
|
||||
},
|
||||
backups: []backupRes{
|
||||
{
|
||||
bup: backupWithTime(baseTime, bupCurrent),
|
||||
err: data.ErrNotFound,
|
||||
},
|
||||
},
|
||||
time: baseTime,
|
||||
buffer: 24 * time.Hour,
|
||||
expectErr: assert.NoError,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range table {
|
||||
@ -426,7 +528,12 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
err: test.snapshotFetchErr,
|
||||
}
|
||||
|
||||
err := cleanupOrphanedData(ctx, mbs, mmf)
|
||||
err := cleanupOrphanedData(
|
||||
ctx,
|
||||
mbs,
|
||||
mmf,
|
||||
test.buffer,
|
||||
func() time.Time { return test.time })
|
||||
test.expectErr(t, err, clues.ToCore(err))
|
||||
})
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user