Basic code for backup cleanup (#4051)

Starting code for removing item data snapshots,
backups, and backup details that have been
orphaned. Data can become orphaned through either
incomplete backup delete operations (older
corso versions) or because backups didn't
complete successfully

This code doesn't cover all cases (see TODOs in
PR) but gets a lot of the boiler-plate that will
be required. Future PRs will build on what's in
here to close the gaps

This code is not wired into any corso operations
so it cannot be run outside of unit tests

---

#### Does this PR need a docs update or release note?

- [ ]  Yes, it's included
- [x] 🕐 Yes, but in a later PR
- [ ]  No

#### Type of change

- [x] 🌻 Feature
- [ ] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [ ] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Issue(s)

* #3217

#### Test Plan

- [x] 💪 Manual
- [ ]  Unit test
- [ ] 💚 E2E
This commit is contained in:
ashmrtn 2023-08-21 15:40:47 -07:00 committed by GitHub
parent 6a83d23ff2
commit 6963f63f4f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 609 additions and 11 deletions

View File

@ -115,14 +115,6 @@ func (me ManifestEntry) GetTag(key string) (string, bool) {
return v, ok return v, ok
} }
type snapshotManager interface {
FindManifests(
ctx context.Context,
tags map[string]string,
) ([]*manifest.EntryMetadata, error)
LoadSnapshot(ctx context.Context, id manifest.ID) (*snapshot.Manifest, error)
}
func serviceCatString(s path.ServiceType, c path.CategoryType) string { func serviceCatString(s path.ServiceType, c path.CategoryType) string {
return s.String() + c.String() return s.String() + c.String()
} }

View File

@ -0,0 +1,156 @@
package kopia
import (
"context"
"errors"
"github.com/alcionai/clues"
"github.com/kopia/kopia/repo/manifest"
"github.com/kopia/kopia/snapshot"
"golang.org/x/exp/maps"
"github.com/alcionai/corso/src/internal/data"
"github.com/alcionai/corso/src/internal/model"
"github.com/alcionai/corso/src/pkg/backup"
"github.com/alcionai/corso/src/pkg/logger"
"github.com/alcionai/corso/src/pkg/store"
)
func cleanupOrphanedData(
ctx context.Context,
bs store.Storer,
mf manifestFinder,
) error {
// Get all snapshot manifests.
snaps, err := mf.FindManifests(
ctx,
map[string]string{
manifest.TypeLabelKey: snapshot.ManifestType,
})
if err != nil {
return clues.Wrap(err, "getting snapshots")
}
var (
// deets is a hash set of the ModelStoreID or snapshot IDs for backup
// details. It contains the IDs for both legacy details stored in the model
// store and newer details stored as a snapshot because it doesn't matter
// what the storage format is. We only need to know the ID so we can:
// 1. check if there's a corresponding backup for them
// 2. delete the details if they're orphaned
deets = map[manifest.ID]struct{}{}
// dataSnaps is a hash set of the snapshot IDs for item data snapshots.
dataSnaps = map[manifest.ID]struct{}{}
)
// TODO(ashmrtn): Exclude all snapshots and details younger than X <duration>.
// Doing so adds some buffer so that even if this is run concurrently with a
// backup it's not likely to delete models just being created. For example,
// running this when another corso instance has created an item data snapshot
// but hasn't yet created the details snapshot or the backup model would
// result in this instance of corso marking the newly created item data
// snapshot for deletion because it appears orphaned.
//
// Excluding only snapshots and details models works for now since the backup
// model is the last thing persisted out of them. If we switch the order of
// persistence then this will need updated as well.
//
// The buffer duration should be longer than the time it would take to do
// details merging and backup model creation. We don't have hard numbers on
// that, but it should be faster than creating the snapshot itself and
// probably happens O(minutes) or O(hours) instead of O(days). Of course, that
// assumes a non-adversarial setup where things such as machine hiberation,
// process freezing (i.e. paused at the OS level), etc. don't occur.
// Sort all the snapshots as either details snapshots or item data snapshots.
for _, snap := range snaps {
k, _ := makeTagKV(TagBackupCategory)
if _, ok := snap.Labels[k]; ok {
dataSnaps[snap.ID] = struct{}{}
continue
}
deets[snap.ID] = struct{}{}
}
// Get all legacy backup details models. The initial version of backup delete
// didn't seem to delete them so they may also be orphaned if the repo is old
// enough.
deetsModels, err := bs.GetIDsForType(ctx, model.BackupDetailsSchema, nil)
if err != nil {
return clues.Wrap(err, "getting legacy backup details")
}
for _, d := range deetsModels {
deets[d.ModelStoreID] = struct{}{}
}
// Get all backup models.
bups, err := bs.GetIDsForType(ctx, model.BackupSchema, nil)
if err != nil {
return clues.Wrap(err, "getting all backup models")
}
toDelete := maps.Clone(deets)
maps.Copy(toDelete, dataSnaps)
for _, bup := range bups {
toDelete[manifest.ID(bup.ModelStoreID)] = struct{}{}
bm := backup.Backup{}
if err := bs.GetWithModelStoreID(
ctx,
model.BackupSchema,
bup.ModelStoreID,
&bm,
); err != nil {
if !errors.Is(err, data.ErrNotFound) {
return clues.Wrap(err, "getting backup model").
With("search_backup_id", bup.ID)
}
// TODO(ashmrtn): This actually needs revised, see above TODO. Leaving it
// here for the moment to get the basic logic in.
//
// Safe to continue if the model wasn't found because that means that the
// possible item data and details for the backup are now orphaned. They'll
// be deleted since we won't remove them from the delete set.
//
// This isn't expected to really pop up, but it's possible if this
// function is run concurrently with either a backup delete or another
// instance of this function.
logger.Ctx(ctx).Debugw(
"backup model not found",
"search_backup_id", bup.ModelStoreID)
continue
}
ssid := bm.StreamStoreID
if len(ssid) == 0 {
ssid = bm.DetailsID
}
_, dataOK := dataSnaps[manifest.ID(bm.SnapshotID)]
_, deetsOK := deets[manifest.ID(ssid)]
// All data is present, we shouldn't garbage collect this backup.
if deetsOK && dataOK {
delete(toDelete, bup.ModelStoreID)
delete(toDelete, manifest.ID(bm.SnapshotID))
delete(toDelete, manifest.ID(ssid))
}
}
// Use single atomic batch delete operation to cleanup to keep from making a
// bunch of manifest content blobs.
if err := bs.DeleteWithModelStoreIDs(ctx, maps.Keys(toDelete)...); err != nil {
return clues.Wrap(err, "deleting orphaned data")
}
// TODO(ashmrtn): Do some pruning of assist backup models so we don't keep
// them around forever.
return nil
}

View File

@ -0,0 +1,433 @@
package kopia
import (
"context"
"fmt"
"testing"
"github.com/alcionai/clues"
"github.com/kopia/kopia/repo/manifest"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"
"github.com/alcionai/corso/src/internal/data"
"github.com/alcionai/corso/src/internal/model"
"github.com/alcionai/corso/src/internal/tester"
"github.com/alcionai/corso/src/pkg/backup"
)
type BackupCleanupUnitSuite struct {
tester.Suite
}
func TestBackupCleanupUnitSuite(t *testing.T) {
suite.Run(t, &BackupCleanupUnitSuite{Suite: tester.NewUnitSuite(t)})
}
type mockManifestFinder struct {
t *testing.T
manifests []*manifest.EntryMetadata
err error
}
func (mmf mockManifestFinder) FindManifests(
ctx context.Context,
tags map[string]string,
) ([]*manifest.EntryMetadata, error) {
assert.Equal(
mmf.t,
map[string]string{"type": "snapshot"},
tags,
"snapshot search tags")
return mmf.manifests, clues.Stack(mmf.err).OrNil()
}
type mockStorer struct {
t *testing.T
details []*model.BaseModel
detailsErr error
backups []backupRes
backupListErr error
expectDeleteIDs []manifest.ID
deleteErr error
}
func (ms mockStorer) Delete(context.Context, model.Schema, model.StableID) error {
return clues.New("not implemented")
}
func (ms mockStorer) Get(context.Context, model.Schema, model.StableID, model.Model) error {
return clues.New("not implemented")
}
func (ms mockStorer) Put(context.Context, model.Schema, model.Model) error {
return clues.New("not implemented")
}
func (ms mockStorer) Update(context.Context, model.Schema, model.Model) error {
return clues.New("not implemented")
}
func (ms mockStorer) GetIDsForType(
_ context.Context,
s model.Schema,
tags map[string]string,
) ([]*model.BaseModel, error) {
assert.Empty(ms.t, tags, "model search tags")
switch s {
case model.BackupDetailsSchema:
return ms.details, clues.Stack(ms.detailsErr).OrNil()
case model.BackupSchema:
var bases []*model.BaseModel
for _, b := range ms.backups {
bases = append(bases, &b.bup.BaseModel)
}
return bases, clues.Stack(ms.backupListErr).OrNil()
}
return nil, clues.New(fmt.Sprintf("unknown type: %s", s.String()))
}
func (ms mockStorer) GetWithModelStoreID(
_ context.Context,
s model.Schema,
id manifest.ID,
m model.Model,
) error {
assert.Equal(ms.t, model.BackupSchema, s, "model get schema")
d := m.(*backup.Backup)
for _, b := range ms.backups {
if id == b.bup.ModelStoreID {
*d = *b.bup
return clues.Stack(b.err).OrNil()
}
}
return clues.Stack(data.ErrNotFound)
}
func (ms mockStorer) DeleteWithModelStoreIDs(
_ context.Context,
ids ...manifest.ID,
) error {
assert.ElementsMatch(ms.t, ms.expectDeleteIDs, ids, "model delete IDs")
return clues.Stack(ms.deleteErr).OrNil()
}
// backupRes represents an individual return value for an item in GetIDsForType
// or the result of GetWithModelStoreID. err is used for GetWithModelStoreID
// only.
type backupRes struct {
bup *backup.Backup
err error
}
func (suite *BackupCleanupUnitSuite) TestCleanupOrphanedData() {
backupTag, _ := makeTagKV(TagBackupCategory)
// Current backup and snapshots.
bupCurrent := &backup.Backup{
BaseModel: model.BaseModel{
ID: model.StableID("current-bup-id"),
ModelStoreID: manifest.ID("current-bup-msid"),
},
SnapshotID: "current-snap-msid",
StreamStoreID: "current-deets-msid",
}
snapCurrent := &manifest.EntryMetadata{
ID: "current-snap-msid",
Labels: map[string]string{
backupTag: "0",
},
}
deetsCurrent := &manifest.EntryMetadata{
ID: "current-deets-msid",
}
// Legacy backup with details in separate model.
bupLegacy := &backup.Backup{
BaseModel: model.BaseModel{
ID: model.StableID("legacy-bup-id"),
ModelStoreID: manifest.ID("legacy-bup-msid"),
},
SnapshotID: "legacy-snap-msid",
DetailsID: "legacy-deets-msid",
}
snapLegacy := &manifest.EntryMetadata{
ID: "legacy-snap-msid",
Labels: map[string]string{
backupTag: "0",
},
}
deetsLegacy := &model.BaseModel{
ID: "legacy-deets-id",
ModelStoreID: "legacy-deets-msid",
}
// Incomplete backup missing data snapshot.
bupNoSnapshot := &backup.Backup{
BaseModel: model.BaseModel{
ID: model.StableID("ns-bup-id"),
ModelStoreID: manifest.ID("ns-bup-id-msid"),
},
StreamStoreID: "ns-deets-msid",
}
deetsNoSnapshot := &manifest.EntryMetadata{
ID: "ns-deets-msid",
}
// Legacy incomplete backup missing data snapshot.
bupLegacyNoSnapshot := &backup.Backup{
BaseModel: model.BaseModel{
ID: model.StableID("ns-legacy-bup-id"),
ModelStoreID: manifest.ID("ns-legacy-bup-id-msid"),
},
DetailsID: "ns-legacy-deets-msid",
}
deetsLegacyNoSnapshot := &model.BaseModel{
ID: "ns-legacy-deets-id",
ModelStoreID: "ns-legacy-deets-msid",
}
// Incomplete backup missing details.
bupNoDetails := &backup.Backup{
BaseModel: model.BaseModel{
ID: model.StableID("nssid-bup-id"),
ModelStoreID: manifest.ID("nssid-bup-msid"),
},
SnapshotID: "nssid-snap-msid",
}
snapNoDetails := &manifest.EntryMetadata{
ID: "nssid-snap-msid",
Labels: map[string]string{
backupTag: "0",
},
}
table := []struct {
name string
snapshots []*manifest.EntryMetadata
snapshotFetchErr error
// only need BaseModel here since we never look inside the details items.
detailsModels []*model.BaseModel
detailsModelListErr error
backups []backupRes
backupListErr error
deleteErr error
expectDeleteIDs []manifest.ID
expectErr assert.ErrorAssertionFunc
}{
{
name: "EmptyRepo",
expectErr: assert.NoError,
},
{
name: "OnlyCompleteBackups Noops",
snapshots: []*manifest.EntryMetadata{
snapCurrent,
deetsCurrent,
snapLegacy,
},
detailsModels: []*model.BaseModel{
deetsLegacy,
},
backups: []backupRes{
{bup: bupCurrent},
{bup: bupLegacy},
},
expectErr: assert.NoError,
},
{
name: "MissingFieldsInBackup CausesCleanup",
snapshots: []*manifest.EntryMetadata{
snapNoDetails,
deetsNoSnapshot,
},
detailsModels: []*model.BaseModel{
deetsLegacyNoSnapshot,
},
backups: []backupRes{
{bup: bupNoSnapshot},
{bup: bupLegacyNoSnapshot},
{bup: bupNoDetails},
},
expectDeleteIDs: []manifest.ID{
manifest.ID(bupNoSnapshot.ModelStoreID),
manifest.ID(bupLegacyNoSnapshot.ModelStoreID),
manifest.ID(bupNoDetails.ModelStoreID),
manifest.ID(deetsLegacyNoSnapshot.ModelStoreID),
snapNoDetails.ID,
deetsNoSnapshot.ID,
},
expectErr: assert.NoError,
},
{
name: "MissingSnapshot CausesCleanup",
snapshots: []*manifest.EntryMetadata{
deetsCurrent,
},
detailsModels: []*model.BaseModel{
deetsLegacy,
},
backups: []backupRes{
{bup: bupCurrent},
{bup: bupLegacy},
},
expectDeleteIDs: []manifest.ID{
manifest.ID(bupCurrent.ModelStoreID),
deetsCurrent.ID,
manifest.ID(bupLegacy.ModelStoreID),
manifest.ID(deetsLegacy.ModelStoreID),
},
expectErr: assert.NoError,
},
{
name: "MissingDetails CausesCleanup",
snapshots: []*manifest.EntryMetadata{
snapCurrent,
snapLegacy,
},
backups: []backupRes{
{bup: bupCurrent},
{bup: bupLegacy},
},
expectDeleteIDs: []manifest.ID{
manifest.ID(bupCurrent.ModelStoreID),
manifest.ID(bupLegacy.ModelStoreID),
snapCurrent.ID,
snapLegacy.ID,
},
expectErr: assert.NoError,
},
{
name: "SnapshotsListError Fails",
snapshotFetchErr: assert.AnError,
backups: []backupRes{
{bup: bupCurrent},
},
expectErr: assert.Error,
},
{
name: "LegacyDetailsListError Fails",
snapshots: []*manifest.EntryMetadata{
snapCurrent,
},
detailsModelListErr: assert.AnError,
backups: []backupRes{
{bup: bupCurrent},
},
expectErr: assert.Error,
},
{
name: "BackupIDsListError Fails",
snapshots: []*manifest.EntryMetadata{
snapCurrent,
deetsCurrent,
},
backupListErr: assert.AnError,
expectErr: assert.Error,
},
{
name: "BackupModelGetErrorNotFound CausesCleanup",
snapshots: []*manifest.EntryMetadata{
snapCurrent,
deetsCurrent,
snapLegacy,
snapNoDetails,
},
detailsModels: []*model.BaseModel{
deetsLegacy,
},
backups: []backupRes{
{bup: bupCurrent},
{
bup: bupLegacy,
err: data.ErrNotFound,
},
{
bup: bupNoDetails,
err: data.ErrNotFound,
},
},
// Backup IDs are still included in here because they're added to the
// deletion set prior to attempting to fetch models. The model store
// delete operation should ignore missing models though so there's no
// issue.
expectDeleteIDs: []manifest.ID{
snapLegacy.ID,
manifest.ID(deetsLegacy.ModelStoreID),
manifest.ID(bupLegacy.ModelStoreID),
snapNoDetails.ID,
manifest.ID(bupNoDetails.ModelStoreID),
},
expectErr: assert.NoError,
},
{
name: "BackupModelGetError Fails",
snapshots: []*manifest.EntryMetadata{
snapCurrent,
deetsCurrent,
snapLegacy,
snapNoDetails,
},
detailsModels: []*model.BaseModel{
deetsLegacy,
},
backups: []backupRes{
{bup: bupCurrent},
{
bup: bupLegacy,
err: assert.AnError,
},
{bup: bupNoDetails},
},
expectErr: assert.Error,
},
}
for _, test := range table {
suite.Run(test.name, func() {
t := suite.T()
ctx, flush := tester.NewContext(t)
defer flush()
mbs := mockStorer{
t: t,
details: test.detailsModels,
detailsErr: test.detailsModelListErr,
backups: test.backups,
backupListErr: test.backupListErr,
expectDeleteIDs: test.expectDeleteIDs,
deleteErr: test.deleteErr,
}
mmf := mockManifestFinder{
t: t,
manifests: test.snapshots,
err: test.snapshotFetchErr,
}
err := cleanupOrphanedData(ctx, mbs, mmf)
test.expectErr(t, err, clues.ToCore(err))
})
}
}

View File

@ -52,9 +52,26 @@ var (
} }
) )
type snapshotLoader interface { type (
SnapshotRoot(man *snapshot.Manifest) (fs.Entry, error) manifestFinder interface {
} FindManifests(
ctx context.Context,
tags map[string]string,
) ([]*manifest.EntryMetadata, error)
}
snapshotManager interface {
manifestFinder
LoadSnapshot(
ctx context.Context,
id manifest.ID,
) (*snapshot.Manifest, error)
}
snapshotLoader interface {
SnapshotRoot(man *snapshot.Manifest) (fs.Entry, error)
}
)
var ( var (
_ snapshotManager = &conn{} _ snapshotManager = &conn{}