corso/src/internal/kopia/cleanup_backups.go
Abin Simon be59928f98
Fix cases where we had a trailing comma (#4208)
Not sure if we want to merge this since it might generate a lot of conflicts, but it should help us add a linter in CI. If we're good with it, I'll add something that can lint for this in a follow-up PR.

Super hacky, but this fix was created using `while true ; do tree-grepper -q go '(argument_list "," @nope .)' | tail -n1| awk -F: "{print \$1,\"+\"\$2\" -c ':norm \$xJZZ'\"}" | xargs vim ; done`.
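
One way the follow-up lint could work is a small `go/ast` pass instead of tree-grepper. A rough sketch (purely illustrative, not part of this PR; the file name comes from the command line) that flags call expressions whose closing paren lands on a line after the last argument:

```go
package main

import (
	"fmt"
	"go/ast"
	"go/parser"
	"go/token"
	"os"
)

func main() {
	fset := token.NewFileSet()

	// Parse the file named on the command line (single-file usage for the sketch).
	f, err := parser.ParseFile(fset, os.Args[1], nil, 0)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	ast.Inspect(f, func(n ast.Node) bool {
		call, ok := n.(*ast.CallExpr)
		if !ok || len(call.Args) == 0 {
			return true
		}

		last := call.Args[len(call.Args)-1]

		// If the closing paren sits on a later line than the final argument, Go
		// syntax forces a trailing comma after that argument, which is the
		// layout this PR removes.
		if fset.Position(call.Rparen).Line > fset.Position(last.End()).Line {
			fmt.Printf("%s: trailing comma before closing paren\n", fset.Position(call.Rparen))
		}

		return true
	})
}
```

Wiring something like that into CI (or a golangci-lint custom rule) would be the follow-up work.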

---

#### Does this PR need a docs update or release note?

- [ ] Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [ ] No

#### Type of change

<!--- Please check the type of change your PR introduces: --->
- [ ] 🌻 Feature
- [ ] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

<!-- Can reference multiple issues. Use one of the following "magic words" - "closes, fixes" to auto-close the Github issue. -->
* https://github.com/alcionai/corso/issues/3654

#### Test Plan

<!-- How will this be tested prior to merging.-->
- [ ] 💪 Manual
- [ ] Unit test
- [ ] 💚 E2E
2023-09-08 17:10:29 +00:00


package kopia

import (
	"context"
	"errors"
	"strings"
	"time"

	"github.com/alcionai/clues"
	"github.com/kopia/kopia/repo/manifest"
	"github.com/kopia/kopia/snapshot"
	"golang.org/x/exp/maps"
	"golang.org/x/exp/slices"

	"github.com/alcionai/corso/src/internal/data"
	"github.com/alcionai/corso/src/internal/model"
	"github.com/alcionai/corso/src/pkg/backup"
	"github.com/alcionai/corso/src/pkg/logger"
	"github.com/alcionai/corso/src/pkg/store"
)

const (
	serviceCatTagPrefix = "sc-"
	kopiaPathLabel      = "path"
	tenantTag           = "tenant"
)

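// These labels/tag keys are used below when copying Reason info from item data
// snapshots onto backup models: transferTags decodes the tenant from the
// snapshot's kopiaPathLabel label, stores it under tenantTag, and rewrites the
// snapshot's Reason tags with the serviceCatTagPrefix prefix.
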
// cleanupOrphanedData uses bs and mf to lookup all models/snapshots for backups
// and deletes items that are older than nowFunc() - gcBuffer (cutoff) that are
// not "complete" backups with:
// - a backup model
// - an item data snapshot
// - a details snapshot or details model
//
// We exclude all items younger than the cutoff to add some buffer so that even
// if this is run concurrently with a backup it's not likely to delete models
// just being created. For example, if there was no buffer period and this is
// run when another corso instance has created an item data snapshot but hasn't
// yet created the details snapshot or the backup model it would result in this
// instance of corso marking the newly created item data snapshot for deletion
// because it appears orphaned.
//
// The buffer duration should be longer than the difference in creation times
// between the first item data snapshot/details/backup model made during a
// backup operation and the last.
//
// We don't have hard numbers on the time right now, but if the order of
// persistence is (item data snapshot, details snapshot, backup model) it should
// be faster than creating the snapshot itself and probably happens O(minutes)
// or O(hours) instead of O(days). Of course, that assumes a non-adversarial
// setup where things such as machine hibernation, process freezing (i.e. paused
// at the OS level), etc. don't occur.
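//
// For example, with a gcBuffer of 24 hours, nothing modified within the last 24
// hours is considered for deletion on this run.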
func cleanupOrphanedData(
	ctx context.Context,
	bs store.Storer,
	mf manifestFinder,
	gcBuffer time.Duration,
	nowFunc func() time.Time,
) error {
	// Get all snapshot manifests.
	snaps, err := mf.FindManifests(
		ctx,
		map[string]string{
			manifest.TypeLabelKey: snapshot.ManifestType,
		})
	if err != nil {
		return clues.Wrap(err, "getting snapshots")
	}

	var (
		// deets is a hash set of the ModelStoreID or snapshot IDs for backup
		// details. It contains the IDs for both legacy details stored in the model
		// store and newer details stored as a snapshot because it doesn't matter
		// what the storage format is. We only need to know the ID so we can:
		// 1. check if there's a corresponding backup for them
		// 2. delete the details if they're orphaned
		deets = map[manifest.ID]struct{}{}
		// dataSnaps is a hash map of the snapshot IDs for item data snapshots.
		dataSnaps = map[manifest.ID]*manifest.EntryMetadata{}
		// toDelete is the set of objects to delete from kopia. It starts out with
		// all items and has ineligible items removed from it.
		toDelete = map[manifest.ID]struct{}{}
	)

	cutoff := nowFunc().Add(-gcBuffer)

	// Sort all the snapshots as either details snapshots or item data snapshots.
	for _, snap := range snaps {
		// Don't even try to see if this needs garbage collected because it's not
		// old enough and may correspond to an in-progress operation.
		if !cutoff.After(snap.ModTime) {
			continue
		}

		toDelete[snap.ID] = struct{}{}

		k, _ := makeTagKV(TagBackupCategory)
		if _, ok := snap.Labels[k]; ok {
			dataSnaps[snap.ID] = snap
			continue
		}

		deets[snap.ID] = struct{}{}
	}

	// Get all legacy backup details models. The initial version of backup delete
	// didn't seem to delete them so they may also be orphaned if the repo is old
	// enough.
	deetsModels, err := bs.GetIDsForType(ctx, model.BackupDetailsSchema, nil)
	if err != nil {
		return clues.Wrap(err, "getting legacy backup details")
	}

	for _, d := range deetsModels {
		// Don't even try to see if this needs garbage collected because it's not
		// old enough and may correspond to an in-progress operation.
		if !cutoff.After(d.ModTime) {
			continue
		}

		deets[d.ModelStoreID] = struct{}{}
		toDelete[d.ModelStoreID] = struct{}{}
	}

	// Get all backup models.
	bups, err := bs.GetIDsForType(ctx, model.BackupSchema, nil)
	if err != nil {
		return clues.Wrap(err, "getting all backup models")
	}

	var (
		// assistBackups is the set of backups that have a
		// * label denoting they're an assist backup
		// * item data snapshot
		// * details snapshot
		assistBackups []*backup.Backup
		// mostRecentMergeBase maps the reason to its most recent merge base's
		// creation time. The map key is created using keysForBackup.
		mostRecentMergeBase = map[string]time.Time{}
	)

	for _, bup := range bups {
		// Don't even try to see if this needs garbage collected because it's not
		// old enough and may correspond to an in-progress operation.
		if !cutoff.After(bup.ModTime) {
			continue
		}

		toDelete[manifest.ID(bup.ModelStoreID)] = struct{}{}

		bm := backup.Backup{}

		if err := bs.GetWithModelStoreID(
			ctx,
			model.BackupSchema,
			bup.ModelStoreID,
			&bm); err != nil {
			if !errors.Is(err, data.ErrNotFound) {
				return clues.Wrap(err, "getting backup model").
					With("search_backup_id", bup.ID)
			}

			// Probably safe to continue if the model wasn't found because that means
			// that the possible item data and details for the backup are now
			// orphaned. They'll be deleted since we won't remove them from the delete
			// set.
			//
			// The fact that we exclude all items younger than the cutoff should
			// already exclude items that are from concurrent corso backup operations.
			//
			// This isn't expected to really pop up, but it's possible if this
			// function is run concurrently with either a backup delete or another
			// instance of this function.
			logger.Ctx(ctx).Infow(
				"backup model not found",
				"search_backup_id", bup.ModelStoreID)

			continue
		}

		ssid := bm.StreamStoreID
		if len(ssid) == 0 {
			ssid = bm.DetailsID
		}

		d, dataOK := dataSnaps[manifest.ID(bm.SnapshotID)]
		_, deetsOK := deets[manifest.ID(ssid)]

		// All data is present, we shouldn't garbage collect this backup.
		if deetsOK && dataOK {
			delete(toDelete, bup.ModelStoreID)
			delete(toDelete, manifest.ID(bm.SnapshotID))
			delete(toDelete, manifest.ID(ssid))

			// This is a little messy to have, but can simplify the logic below.
			// The state of tagging in corso isn't all that great right now and we'd
			// really like to consolidate tags and clean them up. For now, we're
			// going to copy tags that are related to Reasons for a backup from the
			// item data snapshot to the backup model. This makes the function
			// checking if assist backups should be garbage collected a bit easier
			// because now they only have to source data from backup models.
			if err := transferTags(d, &bm); err != nil {
				logger.CtxErr(ctx, err).Infow(
					"transferring legacy tags to backup model",
					"snapshot_id", d.ID,
					"backup_id", bup.ID)

				// Continuing here means the base won't be eligible for old assist
				// base garbage collection or as a newer merge base timestamp.
				//
				// We could add more logic to eventually delete the base if it's an
				// assist base. If it's a merge base then it should be mostly harmless
				// as a newer merge base should cause older assist bases to be garbage
				// collected.
				//
				// Either way, I don't really expect to see failures when transferring
				// tags so not worth adding extra code for unless we see it become a
				// problem.
				continue
			}

			// Add to the assist backup set so that we can attempt to garbage collect
			// older assist backups below.
			if bup.Tags[model.BackupTypeTag] == model.AssistBackup {
				assistBackups = append(assistBackups, &bm)
				continue
			}

			// If it's a merge base, track the time it was created so we can check
			// later if we should remove all assist bases or not.
			tags, err := keysForBackup(&bm)
			if err != nil {
				logger.CtxErr(ctx, err).
					Info("getting Reason keys for merge base. May keep an additional assist base")
			}

			for _, tag := range tags {
				t := mostRecentMergeBase[tag]
				if t.After(bm.CreationTime) {
					// Don't update the merge base time if we've already seen a newer
					// merge base.
					continue
				}

				mostRecentMergeBase[tag] = bm.CreationTime
			}
		}
	}

	logger.Ctx(ctx).Infow(
		"garbage collecting orphaned items",
		"num_items", len(toDelete),
		"kopia_ids", maps.Keys(toDelete))

	// This will technically save a superset of the assist bases we should keep.
	// The reason for that is that we only add something to the set of assist
	// bases after we've excluded backups in the buffer time zone. For example,
	// we could discover that of the set of assist bases we have, something is
	// the youngest and exclude it from garbage collection. However, when looking
	// at the set of all assist bases, including those in the buffer zone, it's
	// possible the one we thought was the youngest actually isn't and could be
	// garbage collected.
	//
	// This sort of edge case will ideally happen only for a few assist bases at
	// a time. Assuming this function is run somewhat periodically, missing these
	// edge cases is alright because they'll get picked up on a subsequent run.
	assistItems := collectOldAssistBases(ctx, mostRecentMergeBase, assistBackups)

	logger.Ctx(ctx).Debugw(
		"garbage collecting old assist bases",
		"assist_num_items", len(assistItems),
		"assist_kopia_ids", assistItems)

	assistItems = append(assistItems, maps.Keys(toDelete)...)

	// Use a single atomic batch delete operation for cleanup to keep from making
	// a bunch of manifest content blobs.
	if err := bs.DeleteWithModelStoreIDs(ctx, assistItems...); err != nil {
		return clues.Wrap(err, "deleting orphaned data")
	}

	return nil
}

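// skipKeys holds item data snapshot tag keys that transferTags should not copy
// onto the backup model because they aren't Reason (service/category) tags.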
var skipKeys = []string{
	TagBackupID,
	TagBackupCategory,
}

func transferTags(snap *manifest.EntryMetadata, bup *backup.Backup) error {
	tenant, err := decodeElement(snap.Labels[kopiaPathLabel])
	if err != nil {
		return clues.Wrap(err, "decoding tenant from label")
	}

	if bup.Tags == nil {
		bup.Tags = map[string]string{}
	}

	bup.Tags[tenantTag] = tenant

	skipTags := map[string]struct{}{}

	for _, k := range skipKeys {
		key, _ := makeTagKV(k)
		skipTags[key] = struct{}{}
	}

	// Safe to check only this because the old field was deprecated prior to the
	// tagging of assist backups and this function only deals with assist
	// backups.
	roid := bup.ProtectedResourceID
	roidK, _ := makeTagKV(roid)
	skipTags[roidK] = struct{}{}

	// This is hacky, but right now we don't have a good way to get only the
	// Reason tags for something. We can, however, find them by searching for all
	// the "normalized" tags and then discarding the ones we know aren't
	// reasons. Unfortunately this won't work if custom tags are added to the
	// backup that we don't know about.
	//
	// Convert them to the newer format that we'd like to have where the
	// service/category tags have the form "sc-<service><category>".
	for tag := range snap.Labels {
		if _, ok := skipTags[tag]; ok || !strings.HasPrefix(tag, userTagPrefix) {
			continue
		}

		bup.Tags[strings.Replace(tag, userTagPrefix, serviceCatTagPrefix, 1)] = "0"
	}

	return nil
}

// keysForBackup returns a slice of string keys representing the Reasons for
// this backup. If there's a problem creating the keys, an error is returned.
func keysForBackup(bup *backup.Backup) ([]string, error) {
	var (
		res []string
		// Safe to pull from this field since assist backups came after we switched
		// to using ProtectedResourceID.
		roid = bup.ProtectedResourceID
	)

	tenant := bup.Tags[tenantTag]
	if len(tenant) == 0 {
		// We can skip this backup. It won't get garbage collected, but it also
		// won't result in incorrect behavior overall.
		return nil, clues.New("missing tenant tag in backup").
			With("backup_id", bup.ID)
	}

	for tag := range bup.Tags {
		if strings.HasPrefix(tag, serviceCatTagPrefix) {
			// The precise way we concatenate all this info doesn't really matter as
			// long as it's consistent for all backups in the set and includes all
			// the pieces we need to ensure uniqueness across.
			fullTag := tenant + roid + tag

			res = append(res, fullTag)
		}
	}

	return res, nil
}

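// collectOldAssistBases returns the kopia IDs of the backup model, item data
// snapshot, and details snapshot for assist backups that, for every Reason they
// cover, have been superseded by a newer assist base or merge base.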
func collectOldAssistBases(
	ctx context.Context,
	mostRecentMergeBase map[string]time.Time,
	bups []*backup.Backup,
) []manifest.ID {
	// maybeDelete is the set of backups that could be deleted. It starts out as
	// the set of all backups and has ineligible backups removed from it.
	maybeDelete := map[manifest.ID]*backup.Backup{}

	// Figure out which backups have overlapping reasons. A single backup can
	// appear in multiple slices in the map, one for each Reason associated with
	// it.
	bupsByReason := map[string][]*backup.Backup{}

	for _, bup := range bups {
		tags, err := keysForBackup(bup)
		if err != nil {
			logger.CtxErr(ctx, err).Error("not checking backup for garbage collection")
			continue
		}

		maybeDelete[manifest.ID(bup.ModelStoreID)] = bup

		for _, tag := range tags {
			bupsByReason[tag] = append(bupsByReason[tag], bup)
		}
	}

	// For each set of backups we found, sort them by time. Mark all but the
	// youngest backup in each group as eligible for garbage collection.
	//
	// We implement this process as removing backups from the set of potential
	// backups to delete because it's possible for a backup to not be the
	// youngest for one Reason but be the youngest for a different Reason (i.e.
	// most recent exchange mail backup but not the most recent exchange
	// contacts backup). A simple delete operation in the map is sufficient to
	// remove a backup even if it's only the youngest for a single Reason.
	// Otherwise we'd need to do another pass after this to determine the
	// isYoungest status for all Reasons in the backup.
	//
	// TODO(ashmrtn): Handle concurrent backups somehow? Right now backups that
	// have overlapping start and end times aren't explicitly handled.
	for tag, bupSet := range bupsByReason {
		if len(bupSet) == 0 {
			continue
		}

		// Sort in reverse chronological order so that we can just remove the
		// zeroth item from the delete set instead of getting the slice length.
		// Unfortunately this could also put us in the pathological case where
		// almost all items need to be swapped since in theory kopia returns
		// results in chronological order and we're processing them in the order
		// kopia returns them.
		slices.SortStableFunc(bupSet, func(a, b *backup.Backup) int {
			return -a.CreationTime.Compare(b.CreationTime)
		})

		// Only remove the youngest assist base from the deletion set if we don't
		// have a merge base that's younger than it. We don't need to check if the
		// value is in the map here because the zero time is always at least as old
		// as the times we'll see in our backups (if we see the zero time in our
		// backup it's a bug but will still pass the check to keep the backup).
		if t := mostRecentMergeBase[tag]; !bupSet[0].CreationTime.Before(t) {
			delete(maybeDelete, manifest.ID(bupSet[0].ModelStoreID))
		}
	}

	res := make([]manifest.ID, 0, 3*len(maybeDelete))

	// For all items remaining in the delete set, generate the final set of items
	// to delete. This set includes the data snapshot ID, details snapshot ID, and
	// backup model ID to delete for each backup.
	for bupID, bup := range maybeDelete {
		// Don't need to check if we use StreamStoreID or DetailsID because
		// DetailsID was deprecated prior to tagging backups as assist backups.
		// Since the input set is only assist backups there's no overlap between the
		// two implementations.
		res = append(
			res,
			bupID,
			manifest.ID(bup.SnapshotID),
			manifest.ID(bup.StreamStoreID))
	}

	return res
}