Tighten checks for assist base garbage collection (#4137)

Do a bit better job checking assist bases for
garbage collection by considering whether
there's a newer merge base already

---

#### Does this PR need a docs update or release note?

- [ ]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [x]  No

#### Type of change

- [x] 🌻 Feature
- [ ] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

* #3217

#### Test Plan

- [ ] 💪 Manual
- [x]  Unit test
- [ ] 💚 E2E
This commit is contained in:
ashmrtn 2023-08-29 09:15:21 -07:00 committed by GitHub
parent 8e3525bbee
commit 4f839d3d75
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 137 additions and 75 deletions

View File

@ -128,11 +128,16 @@ func cleanupOrphanedData(
return clues.Wrap(err, "getting all backup models")
}
var (
// assistBackups is the set of backups that have a
// * a label denoting they're an assist backup
// * label denoting they're an assist backup
// * item data snapshot
// * details snapshot
var assistBackups []*backup.Backup
assistBackups []*backup.Backup
// mostRecentMergeBase maps the reason to its most recent merge base's
// creation time. The map key is created using keysForBackup.
mostRecentMergeBase = map[string]time.Time{}
)
for _, bup := range bups {
// Don't even try to see if this needs garbage collected because it's not
@ -167,7 +172,7 @@ func cleanupOrphanedData(
// This isn't expected to really pop up, but it's possible if this
// function is run concurrently with either a backup delete or another
// instance of this function.
logger.Ctx(ctx).Debugw(
logger.Ctx(ctx).Infow(
"backup model not found",
"search_backup_id", bup.ModelStoreID)
@ -188,9 +193,6 @@ func cleanupOrphanedData(
delete(toDelete, manifest.ID(bm.SnapshotID))
delete(toDelete, manifest.ID(ssid))
// Add to the assist backup set so that we can attempt to garbage collect
// older assist backups below.
if bup.Tags[model.BackupTypeTag] == model.AssistBackup {
// This is a little messy to have, but can simplify the logic below.
// The state of tagging in corso isn't all that great right now and we'd
// really like to consolidate tags and clean them up. For now, we're
@ -199,20 +201,49 @@ func cleanupOrphanedData(
// checking if assist backups should be garbage collected a bit easier
// because now they only have to source data from backup models.
if err := transferTags(d, &bm); err != nil {
logger.Ctx(ctx).Debugw(
logger.CtxErr(ctx, err).Infow(
"transferring legacy tags to backup model",
"err", err,
"snapshot_id", d.ID,
"backup_id", bup.ID)
// Continuing here means the base won't be eligible for old assist
// base garbage collection. We could add more logic to eventually
// delete the base in question but I don't really expect to see
// failures when transferring tags.
// base garbage collection or as a newer merge base timestamp.
//
// We could add more logic to eventually delete the base if it's an
// assist base. If it's a merge base then it should be mostly harmless
// as a newer merge base should cause older assist bases to be garbage
// collected.
//
// Either way, I don't really expect to see failures when transferring
// tags so not worth adding extra code for unless we see it become a
// problem.
continue
}
// Add to the assist backup set so that we can attempt to garbage collect
// older assist backups below.
if bup.Tags[model.BackupTypeTag] == model.AssistBackup {
assistBackups = append(assistBackups, &bm)
continue
}
// If it's a merge base track the time it was created so we can check
// later if we should remove all assist bases or not.
tags, err := keysForBackup(&bm)
if err != nil {
logger.CtxErr(ctx, err).
Info("getting Reason keys for merge base. May keep an additional assist base")
}
for _, tag := range tags {
t := mostRecentMergeBase[tag]
if t.After(bm.CreationTime) {
// Don't update the merge base time if we've already seen a newer
// merge base.
continue
}
mostRecentMergeBase[tag] = bm.CreationTime
}
}
}
@ -234,7 +265,7 @@ func cleanupOrphanedData(
// This sort of edge case will ideally happen only for a few assist bases at
// a time. Assuming this function is run somewhat periodically, missing these
// edge cases is alright because they'll get picked up on a subsequent run.
assistItems := collectOldAssistBases(ctx, assistBackups)
assistItems := collectOldAssistBases(ctx, mostRecentMergeBase, assistBackups)
logger.Ctx(ctx).Debugw(
"garbage collecting old assist bases",
@ -263,6 +294,10 @@ func transferTags(snap *manifest.EntryMetadata, bup *backup.Backup) error {
return clues.Wrap(err, "decoding tenant from label")
}
if bup.Tags == nil {
bup.Tags = map[string]string{}
}
bup.Tags[tenantTag] = tenant
skipTags := map[string]struct{}{}
@ -299,8 +334,40 @@ func transferTags(snap *manifest.EntryMetadata, bup *backup.Backup) error {
return nil
}
// keysForBackup returns a slice of string keys representing the Reasons for this
// backup. If there's a problem creating the keys an error is returned.
func keysForBackup(bup *backup.Backup) ([]string, error) {
var (
res []string
// Safe to pull from this field since assist backups came after we switched
// to using ProtectedResourceID.
roid = bup.ProtectedResourceID
)
tenant := bup.Tags[tenantTag]
if len(tenant) == 0 {
// We can skip this backup. It won't get garbage collected, but it also
// won't result in incorrect behavior overall.
return nil, clues.New("missing tenant tag in backup").
With("backup_id", bup.ID)
}
for tag := range bup.Tags {
if strings.HasPrefix(tag, serviceCatTagPrefix) {
// Precise way we concatenate all this info doesn't really matter as
// long as it's consistent for all backups in the set and includes all
// the pieces we need to ensure uniqueness across.
fullTag := tenant + roid + tag
res = append(res, fullTag)
}
}
return res, nil
}
func collectOldAssistBases(
ctx context.Context,
mostRecentMergeBase map[string]time.Time,
bups []*backup.Backup,
) []manifest.ID {
// maybeDelete is the set of backups that could be deleted. It starts out as
@ -312,28 +379,16 @@ func collectOldAssistBases(
bupsByReason := map[string][]*backup.Backup{}
for _, bup := range bups {
// Safe to pull from this field since assist backups came after we switched
// to using ProtectedResourceID.
roid := bup.ProtectedResourceID
tenant := bup.Tags[tenantTag]
if len(tenant) == 0 {
// We can skip this backup. It won't get garbage collected, but it also
// won't result in incorrect behavior overall.
logger.Ctx(ctx).Infow("missing tenant tag in backup", "backup_id", bup.ID)
tags, err := keysForBackup(bup)
if err != nil {
logger.CtxErr(ctx, err).Error("not checking backup for garbage collection")
continue
}
maybeDelete[manifest.ID(bup.ModelStoreID)] = bup
for tag := range bup.Tags {
if strings.HasPrefix(tag, serviceCatTagPrefix) {
// Precise way we concatenate all this info doesn't really matter as
// long as it's consistent for all backups in the set and includes all
// the pieces we need to ensure uniqueness across.
fullTag := tenant + roid + tag
bupsByReason[fullTag] = append(bupsByReason[fullTag], bup)
}
for _, tag := range tags {
bupsByReason[tag] = append(bupsByReason[tag], bup)
}
}
@ -351,7 +406,7 @@ func collectOldAssistBases(
//
// TODO(ashmrtn): Handle concurrent backups somehow? Right now backups that
// have overlapping start and end times aren't explicitly handled.
for _, bupSet := range bupsByReason {
for tag, bupSet := range bupsByReason {
if len(bupSet) == 0 {
continue
}
@ -366,8 +421,15 @@ func collectOldAssistBases(
return -a.CreationTime.Compare(b.CreationTime)
})
// Only remove the youngest assist base from the deletion set if we don't
// have a merge base that's younger than it. We don't need to check if the
// value is in the map here because the zero time is always at least as old
// as the times we'll see in our backups (if we see the zero time in our
// backup it's a bug but will still pass the check to keep the backup).
if t := mostRecentMergeBase[tag]; !bupSet[0].CreationTime.Before(t) {
delete(maybeDelete, manifest.ID(bupSet[0].ModelStoreID))
}
}
res := make([]manifest.ID, 0, 3*len(maybeDelete))

View File

@ -351,15 +351,17 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
return &res
}
backupAssist := func(protectedResource string, b *backup.Backup) *backup.Backup {
backupWithResource := func(protectedResource string, isAssist bool, b *backup.Backup) *backup.Backup {
res := *b
res.ProtectedResourceID = protectedResource
if isAssist {
if res.Tags == nil {
res.Tags = map[string]string{}
}
res.Tags[model.BackupTypeTag] = model.AssistBackup
}
return &res
}
@ -646,8 +648,8 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
manifestWithTime(baseTime.Add(time.Second), deetsCurrent2()),
},
backups: []backupRes{
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
},
time: baseTime,
buffer: 24 * time.Hour,
@ -677,9 +679,9 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
manifestWithTime(baseTime.Add(time.Minute), deetsCurrent3()),
},
backups: []backupRes{
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Minute), bupCurrent3()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Minute), bupCurrent3()))},
},
expectDeleteIDs: []manifest.ID{
snapCurrent().ID,
@ -694,11 +696,9 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
expectErr: assert.NoError,
},
{
// Test that the most recent assist base is not garbage collected even if
// there's a newer merge base that has the same Reasons as the assist
// base. Also ensure assist bases with the same Reasons that are older
// than the newest assist base are still garbage collected.
name: "AssistBasesAndMergeBase NotYoungest CausesCleanupForAssistBase",
// Test that the most recent assist base is garbage collected if there's a
// newer merge base that has the same Reasons as the assist base.
name: "AssistBasesAndMergeBases NotYoungest CausesCleanupForAssistBase",
snapshots: []*manifest.EntryMetadata{
manifestWithReasons(
manifestWithTime(baseTime, snapCurrent()),
@ -707,21 +707,21 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
manifestWithTime(baseTime, deetsCurrent()),
manifestWithReasons(
manifestWithTime(baseTime.Add(time.Second), snapCurrent2()),
manifestWithTime(baseTime.Add(time.Minute), snapCurrent2()),
"tenant1",
NewReason("", "ro", path.ExchangeService, path.EmailCategory)),
manifestWithTime(baseTime.Add(time.Second), deetsCurrent2()),
manifestWithTime(baseTime.Add(time.Minute), deetsCurrent2()),
manifestWithReasons(
manifestWithTime(baseTime.Add(time.Minute), snapCurrent3()),
manifestWithTime(baseTime.Add(time.Second), snapCurrent3()),
"tenant1",
NewReason("", "ro", path.ExchangeService, path.EmailCategory)),
manifestWithTime(baseTime.Add(time.Minute), deetsCurrent3()),
manifestWithTime(baseTime.Add(time.Second), deetsCurrent3()),
},
backups: []backupRes{
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
{bup: backupWithTime(baseTime.Add(time.Minute), bupCurrent3())},
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
{bup: backupWithResource("ro", false, backupWithTime(baseTime.Add(time.Minute), bupCurrent2()))},
{bup: backupWithResource("ro", false, backupWithTime(baseTime.Add(-time.Second), bupCurrent3()))},
},
expectDeleteIDs: []manifest.ID{
snapCurrent().ID,
@ -751,8 +751,8 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
manifestWithTime(baseTime.Add(time.Second), deetsCurrent2()),
},
backups: []backupRes{
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
},
time: baseTime.Add(48 * time.Hour),
buffer: 24 * time.Hour,
@ -778,8 +778,8 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
manifestWithTime(baseTime.Add(time.Second), deetsCurrent2()),
},
backups: []backupRes{
{bup: backupAssist("ro1", backupWithTime(baseTime, bupCurrent()))},
{bup: backupAssist("ro2", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
{bup: backupWithResource("ro1", true, backupWithTime(baseTime, bupCurrent()))},
{bup: backupWithResource("ro2", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
},
time: baseTime.Add(48 * time.Hour),
buffer: 24 * time.Hour,
@ -805,8 +805,8 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
manifestWithTime(baseTime.Add(time.Second), deetsCurrent2()),
},
backups: []backupRes{
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
},
time: baseTime.Add(48 * time.Hour),
buffer: 24 * time.Hour,
@ -838,9 +838,9 @@ func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
manifestWithTime(baseTime.Add(time.Minute), deetsCurrent3()),
},
backups: []backupRes{
{bup: backupAssist("ro", backupWithTime(baseTime, bupCurrent()))},
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
{bup: backupAssist("ro", backupWithTime(baseTime.Add(time.Minute), bupCurrent3()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime, bupCurrent()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Second), bupCurrent2()))},
{bup: backupWithResource("ro", true, backupWithTime(baseTime.Add(time.Minute), bupCurrent3()))},
},
time: baseTime.Add(48 * time.Hour),
buffer: 24 * time.Hour,