Tighten checks for assist base garbage collection (#4137)
Do a bit better job checking assist bases for garbage collection by considering whether there's a newer merge base already --- #### Does this PR need a docs update or release note? - [ ] ✅ Yes, it's included - [ ] 🕐 Yes, but in a later PR - [x] ⛔ No #### Type of change - [x] 🌻 Feature - [ ] 🐛 Bugfix - [ ] 🗺️ Documentation - [ ] 🤖 Supportability/Tests - [ ] 💻 CI/Deployment - [ ] 🧹 Tech Debt/Cleanup #### Issue(s) * #3217 #### Test Plan - [ ] 💪 Manual - [x] ⚡ Unit test - [ ] 💚 E2E
This commit is contained in:
parent
8e3525bbee
commit
4f839d3d75
@ -128,11 +128,16 @@ func cleanupOrphanedData(
|
||||
return clues.Wrap(err, "getting all backup models")
|
||||
}
|
||||
|
||||
// assistBackups is the set of backups that have a
|
||||
// * a label denoting they're an assist backup
|
||||
// * item data snapshot
|
||||
// * details snapshot
|
||||
var assistBackups []*backup.Backup
|
||||
var (
|
||||
// assistBackups is the set of backups that have a
|
||||
// * label denoting they're an assist backup
|
||||
// * item data snapshot
|
||||
// * details snapshot
|
||||
assistBackups []*backup.Backup
|
||||
// mostRecentMergeBase maps the reason to its most recent merge base's
|
||||
// creation time. The map key is created using keysForBackup.
|
||||
mostRecentMergeBase = map[string]time.Time{}
|
||||
)
|
||||
|
||||
for _, bup := range bups {
|
||||
// Don't even try to see if this needs garbage collected because it's not
|
||||
@ -167,7 +172,7 @@ func cleanupOrphanedData(
|
||||
// This isn't expected to really pop up, but it's possible if this
|
||||
// function is run concurrently with either a backup delete or another
|
||||
// instance of this function.
|
||||
logger.Ctx(ctx).Debugw(
|
||||
logger.Ctx(ctx).Infow(
|
||||
"backup model not found",
|
||||
"search_backup_id", bup.ModelStoreID)
|
||||
|
||||
@ -188,31 +193,57 @@ func cleanupOrphanedData(
|
||||
delete(toDelete, manifest.ID(bm.SnapshotID))
|
||||
delete(toDelete, manifest.ID(ssid))
|
||||
|
||||
// This is a little messy to have, but can simplify the logic below.
|
||||
// The state of tagging in corso isn't all that great right now and we'd
|
||||
// really like to consolidate tags and clean them up. For now, we're
|
||||
// going to copy tags that are related to Reasons for a backup from the
|
||||
// item data snapshot to the backup model. This makes the function
|
||||
// checking if assist backups should be garbage collected a bit easier
|
||||
// because now they only have to source data from backup models.
|
||||
if err := transferTags(d, &bm); err != nil {
|
||||
logger.CtxErr(ctx, err).Infow(
|
||||
"transferring legacy tags to backup model",
|
||||
"snapshot_id", d.ID,
|
||||
"backup_id", bup.ID)
|
||||
|
||||
// Continuing here means the base won't be eligible for old assist
|
||||
// base garbage collection or as a newer merge base timestamp.
|
||||
//
|
||||
// We could add more logic to eventually delete the base if it's an
|
||||
// assist base. If it's a merge base then it should be mostly harmless
|
||||
// as a newer merge base should cause older assist bases to be garbage
|
||||
// collected.
|
||||
//
|
||||
// Either way, I don't really expect to see failures when transferring
|
||||
// tags so not worth adding extra code for unless we see it become a
|
||||
// problem.
|
||||
continue
|
||||
}
|
||||
|
||||
// Add to the assist backup set so that we can attempt to garbage collect
|
||||
// older assist backups below.
|
||||
if bup.Tags[model.BackupTypeTag] == model.AssistBackup {
|
||||
// This is a little messy to have, but can simplify the logic below.
|
||||
// The state of tagging in corso isn't all that great right now and we'd
|
||||
// really like to consolidate tags and clean them up. For now, we're
|
||||
// going to copy tags that are related to Reasons for a backup from the
|
||||
// item data snapshot to the backup model. This makes the function
|
||||
// checking if assist backups should be garbage collected a bit easier
|
||||
// because now they only have to source data from backup models.
|
||||
if err := transferTags(d, &bm); err != nil {
|
||||
logger.Ctx(ctx).Debugw(
|
||||
"transferring legacy tags to backup model",
|
||||
"err", err,
|
||||
"snapshot_id", d.ID,
|
||||
"backup_id", bup.ID)
|
||||
assistBackups = append(assistBackups, &bm)
|
||||
continue
|
||||
}
|
||||
|
||||
// Continuing here means the base won't be eligible for old assist
|
||||
// base garbage collection. We could add more logic to eventually
|
||||
// delete the base in question but I don't really expect to see
|
||||
// failures when transferring tags.
|
||||
// If it's a merge base track the time it was created so we can check
|
||||
// later if we should remove all assist bases or not.
|
||||
tags, err := keysForBackup(&bm)
|
||||
if err != nil {
|
||||
logger.CtxErr(ctx, err).
|
||||
Info("getting Reason keys for merge base. May keep an additional assist base")
|
||||
}
|
||||
|
||||
for _, tag := range tags {
|
||||
t := mostRecentMergeBase[tag]
|
||||
if t.After(bm.CreationTime) {
|
||||
// Don't update the merge base time if we've already seen a newer
|
||||
// merge base.
|
||||
continue
|
||||
}
|
||||
|
||||
assistBackups = append(assistBackups, &bm)
|
||||
mostRecentMergeBase[tag] = bm.CreationTime
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -234,7 +265,7 @@ func cleanupOrphanedData(
|
||||
// This sort of edge case will ideally happen only for a few assist bases at
|
||||
// a time. Assuming this function is run somewhat periodically, missing these
|
||||
// edge cases is alright because they'll get picked up on a subsequent run.
|
||||
assistItems := collectOldAssistBases(ctx, assistBackups)
|
||||
assistItems := collectOldAssistBases(ctx, mostRecentMergeBase, assistBackups)
|
||||
|
||||
logger.Ctx(ctx).Debugw(
|
||||
"garbage collecting old assist bases",
|
||||
@ -263,6 +294,10 @@ func transferTags(snap *manifest.EntryMetadata, bup *backup.Backup) error {
|
||||
return clues.Wrap(err, "decoding tenant from label")
|
||||
}
|
||||
|
||||
if bup.Tags == nil {
|
||||
bup.Tags = map[string]string{}
|
||||
}
|
||||
|
||||
bup.Tags[tenantTag] = tenant
|
||||
|
||||
skipTags := map[string]struct{}{}
|
||||
@ -299,8 +334,40 @@ func transferTags(snap *manifest.EntryMetadata, bup *backup.Backup) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// keysForBackup returns a slice of string keys representing the Reasons for this
|
||||
// backup. If there's a problem creating the keys an error is returned.
|
||||
func keysForBackup(bup *backup.Backup) ([]string, error) {
|
||||
var (
|
||||
res []string
|
||||
// Safe to pull from this field since assist backups came after we switched
|
||||
// to using ProtectedResourceID.
|
||||
roid = bup.ProtectedResourceID
|
||||
)
|
||||
|
||||
tenant := bup.Tags[tenantTag]
|
||||
if len(tenant) == 0 {
|
||||
// We can skip this backup. It won't get garbage collected, but it also
|
||||
// won't result in incorrect behavior overall.
|
||||
return nil, clues.New("missing tenant tag in backup").
|
||||
With("backup_id", bup.ID)
|
||||
}
|
||||
|
||||
for tag := range bup.Tags {
|
||||
if strings.HasPrefix(tag, serviceCatTagPrefix) {
|
||||
// Precise way we concatenate all this info doesn't really matter as
|
||||
// long as it's consistent for all backups in the set and includes all
|
||||
// the pieces we need to ensure uniqueness across.
|
||||
fullTag := tenant + roid + tag
|
||||
res = append(res, fullTag)
|
||||
}
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func collectOldAssistBases(
|
||||
ctx context.Context,
|
||||
mostRecentMergeBase map[string]time.Time,
|
||||
bups []*backup.Backup,
|
||||
) []manifest.ID {
|
||||
// maybeDelete is the set of backups that could be deleted. It starts out as
|
||||
@ -312,28 +379,16 @@ func collectOldAssistBases(
|
||||
bupsByReason := map[string][]*backup.Backup{}
|
||||
|
||||
for _, bup := range bups {
|
||||
// Safe to pull from this field since assist backups came after we switched
|
||||
// to using ProtectedResourceID.
|
||||
roid := bup.ProtectedResourceID
|
||||
|
||||
tenant := bup.Tags[tenantTag]
|
||||
if len(tenant) == 0 {
|
||||
// We can skip this backup. It won't get garbage collected, but it also
|
||||
// won't result in incorrect behavior overall.
|
||||
logger.Ctx(ctx).Infow("missing tenant tag in backup", "backup_id", bup.ID)
|
||||
tags, err := keysForBackup(bup)
|
||||
if err != nil {
|
||||
logger.CtxErr(ctx, err).Error("not checking backup for garbage collection")
|
||||
continue
|
||||
}
|
||||
|
||||
maybeDelete[manifest.ID(bup.ModelStoreID)] = bup
|
||||
|
||||
for tag := range bup.Tags {
|
||||
if strings.HasPrefix(tag, serviceCatTagPrefix) {
|
||||
// Precise way we concatenate all this info doesn't really matter as
|
||||
// long as it's consistent for all backups in the set and includes all
|
||||
// the pieces we need to ensure uniqueness across.
|
||||
fullTag := tenant + roid + tag
|
||||
bupsByReason[fullTag] = append(bupsByReason[fullTag], bup)
|
||||
}
|
||||
for _, tag := range tags {
|
||||
bupsByReason[tag] = append(bupsByReason[tag], bup)
|
||||
}
|
||||
}
|
||||
|
||||
@ -351,7 +406,7 @@ func collectOldAssistBases(
|
||||
//
|
||||
// TODO(ashmrtn): Handle concurrent backups somehow? Right now backups that
|
||||
// have overlapping start and end times aren't explicitly handled.
|
||||
for _, bupSet := range bupsByReason {
|
||||
for tag, bupSet := range bupsByReason {
|
||||
if len(bupSet) == 0 {
|
||||
continue
|
||||
}
|
||||
@ -366,7 +421,14 @@ func collectOldAssistBases(
|
||||
return -a.CreationTime.Compare(b.CreationTime)
|
||||
})
|
||||
|
||||
delete(maybeDelete, manifest.ID(bupSet[0].ModelStoreID))
|
||||
// Only remove the youngest assist base from the deletion set if we don't
|
||||
// have a merge base that's younger than it. We don't need to check if the
|
||||
// value is in the map here because the zero time is always at least as old
|
||||
// as the times we'll see in our backups (if we see the zero time in our
|
||||
// backup it's a bug but will still pass the check to keep the backup).
|
||||
if t := mostRecentMergeBase[tag]; !bupSet[0].CreationTime.Before(t) {
|
||||
delete(maybeDelete, manifest.ID(bupSet[0].ModelStoreID))
|
||||
}
|
||||
}
|
||||
|
||||
res := make([]manifest.ID, 0, 3*len(maybeDelete))
|
||||
|
||||
@ -351,15 +351,17 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
return &res
|
||||
}
|
||||
|
||||
backupAssist := func(protectedResource string, b *backup.Backup) *backup.Backup {
|
||||
backupWithResource := func(protectedResource string, isAssist bool, b *backup.Backup) *backup.Backup {
|
||||
res := *b
|
||||
res.ProtectedResourceID = protectedResource
|
||||
|
||||
if res.Tags == nil {
|
||||
res.Tags = map[string]string{}
|
||||
}
|
||||
if isAssist {
|
||||
if res.Tags == nil {
|
||||
res.Tags = map[string]string{}
|
||||
}
|
||||
|
||||
res.Tags[model.BackupTypeTag] = model.AssistBackup
|
||||
res.Tags[model.BackupTypeTag] = model.AssistBackup
|
||||
}
|
||||
|
||||
return &res
|
||||
}
|
||||
@ -646,8 +648,8 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
manifestWithTime(baseTime.Add(time.Second), deetsCurrent2()),
|
||||
},
|
||||
backups: []backupRes{
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
},
|
||||
time: baseTime,
|
||||
buffer: 24 * time.Hour,
|
||||
@ -677,9 +679,9 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
manifestWithTime(baseTime.Add(time.Minute), deetsCurrent3()),
|
||||
},
|
||||
backups: []backupRes{
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Minute), bupCurrent3()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Minute), bupCurrent3()))},
|
||||
},
|
||||
expectDeleteIDs: []manifest.ID{
|
||||
snapCurrent().ID,
|
||||
@ -694,11 +696,9 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
expectErr: assert.NoError,
|
||||
},
|
||||
{
|
||||
// Test that the most recent assist base is not garbage collected even if
|
||||
// there's a newer merge base that has the same Reasons as the assist
|
||||
// base. Also ensure assist bases with the same Reasons that are older
|
||||
// than the newest assist base are still garbage collected.
|
||||
name: "AssistBasesAndMergeBase NotYoungest CausesCleanupForAssistBase",
|
||||
// Test that the most recent assist base is garbage collected if there's a
|
||||
// newer merge base that has the same Reasons as the assist base.
|
||||
name: "AssistBasesAndMergeBases NotYoungest CausesCleanupForAssistBase",
|
||||
snapshots: []*manifest.EntryMetadata{
|
||||
manifestWithReasons(
|
||||
manifestWithTime(baseTime, snapCurrent()),
|
||||
@ -707,21 +707,21 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
manifestWithTime(baseTime, deetsCurrent()),
|
||||
|
||||
manifestWithReasons(
|
||||
manifestWithTime(baseTime.Add(time.Second), snapCurrent2()),
|
||||
manifestWithTime(baseTime.Add(time.Minute), snapCurrent2()),
|
||||
"tenant1",
|
||||
NewReason("", "ro", path.ExchangeService, path.EmailCategory)),
|
||||
manifestWithTime(baseTime.Add(time.Second), deetsCurrent2()),
|
||||
manifestWithTime(baseTime.Add(time.Minute), deetsCurrent2()),
|
||||
|
||||
manifestWithReasons(
|
||||
manifestWithTime(baseTime.Add(time.Minute), snapCurrent3()),
|
||||
manifestWithTime(baseTime.Add(time.Second), snapCurrent3()),
|
||||
"tenant1",
|
||||
NewReason("", "ro", path.ExchangeService, path.EmailCategory)),
|
||||
manifestWithTime(baseTime.Add(time.Minute), deetsCurrent3()),
|
||||
manifestWithTime(baseTime.Add(time.Second), deetsCurrent3()),
|
||||
},
|
||||
backups: []backupRes{
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
{bup: backupWithTime(baseTime.Add(time.Minute), bupCurrent3())},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupWithResource("ro", false, backupWithTime(baseTime.Add(time.Minute), bupCurrent2()))},
|
||||
{bup: backupWithResource("ro", false, backupWithTime(baseTime.Add(-time.Second), bupCurrent3()))},
|
||||
},
|
||||
expectDeleteIDs: []manifest.ID{
|
||||
snapCurrent().ID,
|
||||
@ -751,8 +751,8 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
manifestWithTime(baseTime.Add(time.Second), deetsCurrent2()),
|
||||
},
|
||||
backups: []backupRes{
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
},
|
||||
time: baseTime.Add(48 * time.Hour),
|
||||
buffer: 24 * time.Hour,
|
||||
@ -778,8 +778,8 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
manifestWithTime(baseTime.Add(time.Second), deetsCurrent2()),
|
||||
},
|
||||
backups: []backupRes{
|
||||
{bup: backupAssist("ro1", backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupAssist("ro2", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
{bup: backupWithResource("ro1", true, backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupWithResource("ro2", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
},
|
||||
time: baseTime.Add(48 * time.Hour),
|
||||
buffer: 24 * time.Hour,
|
||||
@ -805,8 +805,8 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
manifestWithTime(baseTime.Add(time.Second), deetsCurrent2()),
|
||||
},
|
||||
backups: []backupRes{
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
},
|
||||
time: baseTime.Add(48 * time.Hour),
|
||||
buffer: 24 * time.Hour,
|
||||
@ -838,9 +838,9 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
|
||||
manifestWithTime(baseTime.Add(time.Minute), deetsCurrent3()),
|
||||
},
|
||||
backups: []backupRes{
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Minute), bupCurrent3()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
|
||||
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Minute), bupCurrent3()))},
|
||||
},
|
||||
time: baseTime.Add(48 * time.Hour),
|
||||
buffer: 24 * time.Hour,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user