Exclude recently created models from garbage collection (#4066)
Exclude models that have been created within the buffer period from garbage collection/orphaned checks so that we don't accidentally delete models for backups that are running concurrently with the garbage collection task --- #### Does this PR need a docs update or release note? - [ ] ✅ Yes, it's included - [x] 🕐 Yes, but in a later PR - [ ] ⛔ No #### Type of change - [x] 🌻 Feature - [ ] 🐛 Bugfix - [ ] 🗺️ Documentation - [ ] 🤖 Supportability/Tests - [ ] 💻 CI/Deployment - [ ] 🧹 Tech Debt/Cleanup #### Issue(s) * #3217 #### Test Plan - [ ] 💪 Manual - [x] ⚡ Unit test - [ ] 💚 E2E
This commit is contained in:
parent
99edf7d5b0
commit
11253bf816
@ -3,6 +3,7 @@ package kopia
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/alcionai/clues"
|
"github.com/alcionai/clues"
|
||||||
"github.com/kopia/kopia/repo/manifest"
|
"github.com/kopia/kopia/repo/manifest"
|
||||||
@ -16,10 +17,37 @@ import (
|
|||||||
"github.com/alcionai/corso/src/pkg/store"
|
"github.com/alcionai/corso/src/pkg/store"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// cleanupOrphanedData uses bs and mf to lookup all models/snapshots for backups
|
||||||
|
// and deletes items that are older than nowFunc() - gcBuffer (cutoff) that are
|
||||||
|
// not "complete" backups with:
|
||||||
|
// - a backup model
|
||||||
|
// - an item data snapshot
|
||||||
|
// - a details snapshot or details model
|
||||||
|
//
|
||||||
|
// We exclude all items younger than the cutoff to add some buffer so that even
|
||||||
|
// if this is run concurrently with a backup it's not likely to delete models
|
||||||
|
// just being created. For example, if there was no buffer period and this is
|
||||||
|
// run when another corso instance has created an item data snapshot but hasn't
|
||||||
|
// yet created the details snapshot or the backup model it would result in this
|
||||||
|
// instance of corso marking the newly created item data snapshot for deletion
|
||||||
|
// because it appears orphaned.
|
||||||
|
//
|
||||||
|
// The buffer duration should be longer than the difference in creation times
|
||||||
|
// between the first item data snapshot/details/backup model made during a
|
||||||
|
// backup operation and the last.
|
||||||
|
//
|
||||||
|
// We don't have hard numbers on the time right now, but if the order of
|
||||||
|
// persistence is (item data snapshot, details snapshot, backup model) it should
|
||||||
|
// be faster than creating the snapshot itself and probably happens O(minutes)
|
||||||
|
// or O(hours) instead of O(days). Of course, that assumes a non-adversarial
|
||||||
|
// setup where things such as machine hiberation, process freezing (i.e. paused
|
||||||
|
// at the OS level), etc. don't occur.
|
||||||
func cleanupOrphanedData(
|
func cleanupOrphanedData(
|
||||||
ctx context.Context,
|
ctx context.Context,
|
||||||
bs store.Storer,
|
bs store.Storer,
|
||||||
mf manifestFinder,
|
mf manifestFinder,
|
||||||
|
gcBuffer time.Duration,
|
||||||
|
nowFunc func() time.Time,
|
||||||
) error {
|
) error {
|
||||||
// Get all snapshot manifests.
|
// Get all snapshot manifests.
|
||||||
snaps, err := mf.FindManifests(
|
snaps, err := mf.FindManifests(
|
||||||
@ -43,27 +71,16 @@ func cleanupOrphanedData(
|
|||||||
dataSnaps = map[manifest.ID]struct{}{}
|
dataSnaps = map[manifest.ID]struct{}{}
|
||||||
)
|
)
|
||||||
|
|
||||||
// TODO(ashmrtn): Exclude all snapshots and details younger than X <duration>.
|
cutoff := nowFunc().Add(-gcBuffer)
|
||||||
// Doing so adds some buffer so that even if this is run concurrently with a
|
|
||||||
// backup it's not likely to delete models just being created. For example,
|
|
||||||
// running this when another corso instance has created an item data snapshot
|
|
||||||
// but hasn't yet created the details snapshot or the backup model would
|
|
||||||
// result in this instance of corso marking the newly created item data
|
|
||||||
// snapshot for deletion because it appears orphaned.
|
|
||||||
//
|
|
||||||
// Excluding only snapshots and details models works for now since the backup
|
|
||||||
// model is the last thing persisted out of them. If we switch the order of
|
|
||||||
// persistence then this will need updated as well.
|
|
||||||
//
|
|
||||||
// The buffer duration should be longer than the time it would take to do
|
|
||||||
// details merging and backup model creation. We don't have hard numbers on
|
|
||||||
// that, but it should be faster than creating the snapshot itself and
|
|
||||||
// probably happens O(minutes) or O(hours) instead of O(days). Of course, that
|
|
||||||
// assumes a non-adversarial setup where things such as machine hiberation,
|
|
||||||
// process freezing (i.e. paused at the OS level), etc. don't occur.
|
|
||||||
|
|
||||||
// Sort all the snapshots as either details snapshots or item data snapshots.
|
// Sort all the snapshots as either details snapshots or item data snapshots.
|
||||||
for _, snap := range snaps {
|
for _, snap := range snaps {
|
||||||
|
// Don't even try to see if this needs garbage collected because it's not
|
||||||
|
// old enough and may correspond to an in-progress operation.
|
||||||
|
if !cutoff.After(snap.ModTime) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
k, _ := makeTagKV(TagBackupCategory)
|
k, _ := makeTagKV(TagBackupCategory)
|
||||||
if _, ok := snap.Labels[k]; ok {
|
if _, ok := snap.Labels[k]; ok {
|
||||||
dataSnaps[snap.ID] = struct{}{}
|
dataSnaps[snap.ID] = struct{}{}
|
||||||
@ -82,6 +99,12 @@ func cleanupOrphanedData(
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, d := range deetsModels {
|
for _, d := range deetsModels {
|
||||||
|
// Don't even try to see if this needs garbage collected because it's not
|
||||||
|
// old enough and may correspond to an in-progress operation.
|
||||||
|
if !cutoff.After(d.ModTime) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
deets[d.ModelStoreID] = struct{}{}
|
deets[d.ModelStoreID] = struct{}{}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -95,6 +118,12 @@ func cleanupOrphanedData(
|
|||||||
maps.Copy(toDelete, dataSnaps)
|
maps.Copy(toDelete, dataSnaps)
|
||||||
|
|
||||||
for _, bup := range bups {
|
for _, bup := range bups {
|
||||||
|
// Don't even try to see if this needs garbage collected because it's not
|
||||||
|
// old enough and may correspond to an in-progress operation.
|
||||||
|
if !cutoff.After(bup.ModTime) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
toDelete[manifest.ID(bup.ModelStoreID)] = struct{}{}
|
toDelete[manifest.ID(bup.ModelStoreID)] = struct{}{}
|
||||||
|
|
||||||
bm := backup.Backup{}
|
bm := backup.Backup{}
|
||||||
@ -110,12 +139,13 @@ func cleanupOrphanedData(
|
|||||||
With("search_backup_id", bup.ID)
|
With("search_backup_id", bup.ID)
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(ashmrtn): This actually needs revised, see above TODO. Leaving it
|
// Probably safe to continue if the model wasn't found because that means
|
||||||
// here for the moment to get the basic logic in.
|
// that the possible item data and details for the backup are now
|
||||||
|
// orphaned. They'll be deleted since we won't remove them from the delete
|
||||||
|
// set.
|
||||||
//
|
//
|
||||||
// Safe to continue if the model wasn't found because that means that the
|
// The fact that we exclude all items younger than the cutoff should
|
||||||
// possible item data and details for the backup are now orphaned. They'll
|
// already exclude items that are from concurrent corso backup operations.
|
||||||
// be deleted since we won't remove them from the delete set.
|
|
||||||
//
|
//
|
||||||
// This isn't expected to really pop up, but it's possible if this
|
// This isn't expected to really pop up, but it's possible if this
|
||||||
// function is run concurrently with either a backup delete or another
|
// function is run concurrently with either a backup delete or another
|
||||||
@ -143,6 +173,11 @@ func cleanupOrphanedData(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
logger.Ctx(ctx).Infow(
|
||||||
|
"garbage collecting orphaned items",
|
||||||
|
"num_items", len(toDelete),
|
||||||
|
"kopia_ids", maps.Keys(toDelete))
|
||||||
|
|
||||||
// Use single atomic batch delete operation to cleanup to keep from making a
|
// Use single atomic batch delete operation to cleanup to keep from making a
|
||||||
// bunch of manifest content blobs.
|
// bunch of manifest content blobs.
|
||||||
if err := bs.DeleteWithModelStoreIDs(ctx, maps.Keys(toDelete)...); err != nil {
|
if err := bs.DeleteWithModelStoreIDs(ctx, maps.Keys(toDelete)...); err != nil {
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/alcionai/clues"
|
"github.com/alcionai/clues"
|
||||||
"github.com/kopia/kopia/repo/manifest"
|
"github.com/kopia/kopia/repo/manifest"
|
||||||
@ -221,6 +222,28 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get some stable time so that we can do everything relative to this in the
|
||||||
|
// tests. Mostly just makes reasoning/viewing times easier because the only
|
||||||
|
// differences will be the changes we make.
|
||||||
|
baseTime := time.Now()
|
||||||
|
|
||||||
|
manifestWithTime := func(
|
||||||
|
mt time.Time,
|
||||||
|
m *manifest.EntryMetadata,
|
||||||
|
) *manifest.EntryMetadata {
|
||||||
|
res := *m
|
||||||
|
res.ModTime = mt
|
||||||
|
|
||||||
|
return &res
|
||||||
|
}
|
||||||
|
|
||||||
|
backupWithTime := func(mt time.Time, b *backup.Backup) *backup.Backup {
|
||||||
|
res := *b
|
||||||
|
res.ModTime = mt
|
||||||
|
|
||||||
|
return &res
|
||||||
|
}
|
||||||
|
|
||||||
table := []struct {
|
table := []struct {
|
||||||
name string
|
name string
|
||||||
snapshots []*manifest.EntryMetadata
|
snapshots []*manifest.EntryMetadata
|
||||||
@ -231,12 +254,15 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
|||||||
backups []backupRes
|
backups []backupRes
|
||||||
backupListErr error
|
backupListErr error
|
||||||
deleteErr error
|
deleteErr error
|
||||||
|
time time.Time
|
||||||
|
buffer time.Duration
|
||||||
|
|
||||||
expectDeleteIDs []manifest.ID
|
expectDeleteIDs []manifest.ID
|
||||||
expectErr assert.ErrorAssertionFunc
|
expectErr assert.ErrorAssertionFunc
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "EmptyRepo",
|
name: "EmptyRepo",
|
||||||
|
time: baseTime,
|
||||||
expectErr: assert.NoError,
|
expectErr: assert.NoError,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -253,6 +279,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
|||||||
{bup: bupCurrent},
|
{bup: bupCurrent},
|
||||||
{bup: bupLegacy},
|
{bup: bupLegacy},
|
||||||
},
|
},
|
||||||
|
time: baseTime,
|
||||||
expectErr: assert.NoError,
|
expectErr: assert.NoError,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -277,6 +304,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
|||||||
snapNoDetails.ID,
|
snapNoDetails.ID,
|
||||||
deetsNoSnapshot.ID,
|
deetsNoSnapshot.ID,
|
||||||
},
|
},
|
||||||
|
time: baseTime,
|
||||||
expectErr: assert.NoError,
|
expectErr: assert.NoError,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -297,6 +325,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
|||||||
manifest.ID(bupLegacy.ModelStoreID),
|
manifest.ID(bupLegacy.ModelStoreID),
|
||||||
manifest.ID(deetsLegacy.ModelStoreID),
|
manifest.ID(deetsLegacy.ModelStoreID),
|
||||||
},
|
},
|
||||||
|
time: baseTime,
|
||||||
expectErr: assert.NoError,
|
expectErr: assert.NoError,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -315,6 +344,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
|||||||
snapCurrent.ID,
|
snapCurrent.ID,
|
||||||
snapLegacy.ID,
|
snapLegacy.ID,
|
||||||
},
|
},
|
||||||
|
time: baseTime,
|
||||||
expectErr: assert.NoError,
|
expectErr: assert.NoError,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -334,6 +364,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
|||||||
backups: []backupRes{
|
backups: []backupRes{
|
||||||
{bup: bupCurrent},
|
{bup: bupCurrent},
|
||||||
},
|
},
|
||||||
|
time: baseTime,
|
||||||
expectErr: assert.Error,
|
expectErr: assert.Error,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -343,6 +374,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
|||||||
deetsCurrent,
|
deetsCurrent,
|
||||||
},
|
},
|
||||||
backupListErr: assert.AnError,
|
backupListErr: assert.AnError,
|
||||||
|
time: baseTime,
|
||||||
expectErr: assert.Error,
|
expectErr: assert.Error,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -378,6 +410,7 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
|||||||
snapNoDetails.ID,
|
snapNoDetails.ID,
|
||||||
manifest.ID(bupNoDetails.ModelStoreID),
|
manifest.ID(bupNoDetails.ModelStoreID),
|
||||||
},
|
},
|
||||||
|
time: baseTime,
|
||||||
expectErr: assert.NoError,
|
expectErr: assert.NoError,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -399,8 +432,77 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
|||||||
},
|
},
|
||||||
{bup: bupNoDetails},
|
{bup: bupNoDetails},
|
||||||
},
|
},
|
||||||
|
time: baseTime,
|
||||||
expectErr: assert.Error,
|
expectErr: assert.Error,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "DeleteError Fails",
|
||||||
|
snapshots: []*manifest.EntryMetadata{
|
||||||
|
snapCurrent,
|
||||||
|
deetsCurrent,
|
||||||
|
snapLegacy,
|
||||||
|
snapNoDetails,
|
||||||
|
},
|
||||||
|
detailsModels: []*model.BaseModel{
|
||||||
|
deetsLegacy,
|
||||||
|
},
|
||||||
|
backups: []backupRes{
|
||||||
|
{bup: bupCurrent},
|
||||||
|
{bup: bupLegacy},
|
||||||
|
{bup: bupNoDetails},
|
||||||
|
},
|
||||||
|
expectDeleteIDs: []manifest.ID{
|
||||||
|
snapNoDetails.ID,
|
||||||
|
manifest.ID(bupNoDetails.ModelStoreID),
|
||||||
|
},
|
||||||
|
deleteErr: assert.AnError,
|
||||||
|
time: baseTime,
|
||||||
|
expectErr: assert.Error,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "MissingSnapshot BarelyTooYoungForCleanup Noops",
|
||||||
|
snapshots: []*manifest.EntryMetadata{
|
||||||
|
manifestWithTime(baseTime, deetsCurrent),
|
||||||
|
},
|
||||||
|
backups: []backupRes{
|
||||||
|
{bup: backupWithTime(baseTime, bupCurrent)},
|
||||||
|
},
|
||||||
|
time: baseTime.Add(24 * time.Hour),
|
||||||
|
buffer: 24 * time.Hour,
|
||||||
|
expectErr: assert.NoError,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "MissingSnapshot BarelyOldEnough CausesCleanup",
|
||||||
|
snapshots: []*manifest.EntryMetadata{
|
||||||
|
manifestWithTime(baseTime, deetsCurrent),
|
||||||
|
},
|
||||||
|
backups: []backupRes{
|
||||||
|
{bup: backupWithTime(baseTime, bupCurrent)},
|
||||||
|
},
|
||||||
|
expectDeleteIDs: []manifest.ID{
|
||||||
|
deetsCurrent.ID,
|
||||||
|
manifest.ID(bupCurrent.ModelStoreID),
|
||||||
|
},
|
||||||
|
time: baseTime.Add((24 * time.Hour) + time.Second),
|
||||||
|
buffer: 24 * time.Hour,
|
||||||
|
expectErr: assert.NoError,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "BackupGetErrorNotFound TooYoung Noops",
|
||||||
|
snapshots: []*manifest.EntryMetadata{
|
||||||
|
manifestWithTime(baseTime, snapCurrent),
|
||||||
|
manifestWithTime(baseTime, deetsCurrent),
|
||||||
|
},
|
||||||
|
backups: []backupRes{
|
||||||
|
{
|
||||||
|
bup: backupWithTime(baseTime, bupCurrent),
|
||||||
|
err: data.ErrNotFound,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
time: baseTime,
|
||||||
|
buffer: 24 * time.Hour,
|
||||||
|
expectErr: assert.NoError,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, test := range table {
|
for _, test := range table {
|
||||||
@ -426,7 +528,12 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
|||||||
err: test.snapshotFetchErr,
|
err: test.snapshotFetchErr,
|
||||||
}
|
}
|
||||||
|
|
||||||
err := cleanupOrphanedData(ctx, mbs, mmf)
|
err := cleanupOrphanedData(
|
||||||
|
ctx,
|
||||||
|
mbs,
|
||||||
|
mmf,
|
||||||
|
test.buffer,
|
||||||
|
func() time.Time { return test.time })
|
||||||
test.expectErr(t, err, clues.ToCore(err))
|
test.expectErr(t, err, clues.ToCore(err))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user