Compare commits

...

13 Commits

Author SHA1 Message Date
Ashlie Martinez
7dddc97bbc Fixup file names 2023-03-31 15:11:33 -07:00
Ashlie Martinez
8a7e607474 Function to decode manifests with custom code 2023-03-31 14:33:29 -07:00
Ashlie Martinez
e3dbbc546a Just use errors package for portability 2023-03-31 12:41:17 -07:00
Ashlie Martinez
e02cbfdb73 Code to generate manifest entries
Uses random data for most fields.
2023-03-31 12:40:34 -07:00
Ashlie Martinez
0e74d15259 Add structs for kopia manifests 2023-03-31 12:32:10 -07:00
Vaibhav Kamra
fb08c2374e Add benchmark tests 2023-03-31 10:29:40 -07:00
Vaibhav Kamra
7f91344fda Handle all manifest entry fields 2023-03-31 00:34:28 -07:00
Vaibhav Kamra
62bfed94d6 Change struct to include RawMessage and update jsonparser 2023-03-30 20:47:43 -07:00
Ashlie Martinez
dddbd36969 Update to new struct layout
Also cleanup the code so it's a bit more linear.
2023-03-30 14:01:46 -07:00
Vaibhav Kamra
2bc40b4a39 Jsonparser prototype 2023-03-30 13:39:18 -07:00
Ashlie Martinez
f1b65c9f8b Test program of custom JSON array decoder 2023-03-30 12:43:33 -07:00
Ashlie Martinez
4316136de8 Basic helper to stream a json array
Uses regular decode to decode actual object data for items in the array.
2023-03-30 12:42:53 -07:00
Ashlie Martinez
f00970493d Helper programs to repro JSON deserialize mem use
JSON deserialize of arrays is inefficient for memory. This is a minimal
reproducer to show that it uses lots more memory than expected.

Build each program separately, run gen and then you can run goread for
the repro and some memory usage stats.
2023-03-30 10:41:56 -07:00
13 changed files with 762 additions and 0 deletions

View File

@ -0,0 +1,33 @@
package main
import (
"fmt"
"os"
"github.com/alcionai/corso/src/cmd/jsondebug/common"
"github.com/alcionai/corso/src/cmd/jsondebug/decoder"
)
// main runs readData, which decodes the generated input file with the custom
// streaming decoder and prints memory statistics.
func main() {
readData()
}
// readData opens common.FileName, decodes it with decoder.DecodeFooArray, then
// prints memory statistics followed by the number of decoded entries. Any
// failure is reported on stdout and ends the function early.
func readData() {
	in, openErr := os.Open(common.FileName)
	if openErr != nil {
		fmt.Printf("Error opening input file: %v\n", openErr)
		return
	}

	defer in.Close()

	decoded, decodeErr := decoder.DecodeFooArray(in)
	if decodeErr != nil {
		fmt.Printf("Error decoding input: %v\n", decodeErr)
		return
	}

	// Stats are taken while the decoded data is still referenced below.
	common.PrintMemUsage()

	count := len(decoded.Entries)
	fmt.Printf("got array with %d items\n", count)
}

View File

@ -0,0 +1,9 @@
package main
import "testing"
// Benchmark_readData runs one full readData call per benchmark iteration.
func Benchmark_readData(b *testing.B) {
	for remaining := b.N; remaining > 0; remaining-- {
		readData()
	}
}

View File

@ -0,0 +1,115 @@
package common
import (
"encoding/json"
"fmt"
"runtime"
"time"
)
// Shared settings for the generator and reader programs.
const (
// NumItems is the number of entries the generator programs create.
NumItems = 300000
// ItemSize is the number of random bytes generated for each Foo payload.
ItemSize = 1024
// GzipSuffix is appended to ManifestFileName to name the gzip-compressed
// copy of the manifest data.
GzipSuffix = ".gz"
// FileName is the file that holds the generated FooArray JSON.
FileName = "input.json"
)
// ManifestFileName is the name of the generated manifest file; it embeds
// NumItems, e.g. "manifest-input.300000.json".
var ManifestFileName = fmt.Sprintf("manifest-input.%d.json", NumItems)
// FooArray is the top-level JSON object of the test input: a single "entries"
// array of Foo items.
type FooArray struct {
Entries []*Foo `json:"entries"`
}
// Foo is one element of FooArray.Entries.
type Foo struct {
ID string `json:"id"`
Labels map[string]string `json:"labels"`
ModTime time.Time `json:"modified"`
Deleted bool `json:"deleted,omitempty"`
// Content is kept as raw, undecoded JSON under the "data" key. The
// generator stores a serialized Content value here.
Content json.RawMessage `json:"data"`
}
// Content is the inner payload that the generator serializes into Foo.Content.
type Content struct {
ID string `json:"id"`
// Data is a byte slice, which encoding/json writes as a base64 string.
Data []byte `json:"data"`
}
// Manifest is the top-level JSON object of the manifest test input: a single
// "entries" array of ManifestEntry items.
type Manifest struct {
Entries []*ManifestEntry `json:"entries"`
}
// ManifestEntry is one element of Manifest.Entries. It has the same fields and
// JSON keys as Foo.
type ManifestEntry struct {
ID string `json:"id"`
Labels map[string]string `json:"labels"`
ModTime time.Time `json:"modified"`
Deleted bool `json:"deleted,omitempty"`
// Content is kept as raw, undecoded JSON under the "data" key. The manifest
// generator stores a serialized SnapManifest here.
Content json.RawMessage `json:"data"`
}
// SnapManifest is the inner document that the manifest generator serializes
// into ManifestEntry.Content.
// NOTE(review): presumably mirrors kopia's snapshot manifest layout (the commit
// history mentions "structs for kopia manifests") — confirm against kopia.
type SnapManifest struct {
ID string `json:"id"`
Source SourceInfo `json:"source"`
Description string `json:"description"`
StartTime int64 `json:"startTime"`
EndTime int64 `json:"endTime"`
// NOTE: encoding/json never considers a struct value empty, so omitempty
// has no effect on this field; "stats" is always written.
Stats StatsS `json:"stats,omitempty"`
IncompleteReason string `json:"incomplete,omitempty"`
RootEntry *DirEntry `json:"rootEntry"`
Tags map[string]string `json:"tags,omitempty"`
}
// SourceInfo is the value of SnapManifest.Source. The manifest generator copies
// its three fields into the entry labels as "hostname", "username" and "path".
type SourceInfo struct {
Host string `json:"host"`
UserName string `json:"userName"`
Path string `json:"path"`
}
// StatsS is the value of SnapManifest.Stats: a set of size and count fields.
// The manifest generator fills every field with a random number.
type StatsS struct {
TotalFileSize int64 `json:"totalSize"`
ExcludedTotalFileSize int64 `json:"excludedTotalSize"`
TotalFileCount int32 `json:"fileCount"`
CachedFiles int32 `json:"cachedFiles"`
NonCachedFiles int32 `json:"nonCachedFiles"`
TotalDirectoryCount int32 `json:"dirCount"`
ExcludedFileCount int32 `json:"excludedFileCount"`
ExcludedDirCount int32 `json:"excludedDirCount"`
IgnoredErrorCount int32 `json:"ignoredErrorCount"`
ErrorCount int32 `json:"errorCount"`
}
// DirEntry is the type of SnapManifest.RootEntry. Every field is tagged
// omitempty, so zero-valued fields are left out of the JSON output.
type DirEntry struct {
Name string `json:"name,omitempty"`
EntryType string `json:"type,omitempty"`
Permissions int `json:"mode,omitempty"`
FileSize int64 `json:"size,omitempty"`
ModTime int64 `json:"mtime,omitempty"`
UserID int32 `json:"uid,omitempty"`
GroupID int32 `json:"gid,omitempty"`
ObjectID string `json:"obj,omitempty"`
DirSummary *DirectorySummary `json:"summ,omitempty"`
}
// DirectorySummary is the type of DirEntry.DirSummary. Only IncompleteReason
// and IgnoredErrorCount are omitted from JSON when zero-valued.
type DirectorySummary struct {
TotalFileSize int64 `json:"size"`
TotalFileCount int64 `json:"files"`
TotalSymlinkCount int64 `json:"symlinks"`
TotalDirCount int64 `json:"dirs"`
MaxModTime int64 `json:"maxTime"`
IncompleteReason string `json:"incomplete,omitempty"`
FatalErrorCount int `json:"numFailed"`
IgnoredErrorCount int `json:"numIgnoredErrors,omitempty"`
}
// PrintMemUsage writes a single line to stdout with the current Alloc,
// TotalAlloc and Sys values (in whole MiB) and the NumGC count taken from
// runtime.ReadMemStats.
func PrintMemUsage() {
	var stats runtime.MemStats

	runtime.ReadMemStats(&stats)

	// Field meanings are described at https://golang.org/pkg/runtime/#MemStats
	sizes := []struct {
		format string
		bytes  uint64
	}{
		{"Alloc = %v MiB", stats.Alloc},
		{"\tTotalAlloc = %v MiB", stats.TotalAlloc},
		{"\tSys = %v MiB", stats.Sys},
	}

	for _, size := range sizes {
		fmt.Printf(size.format, bToMb(size.bytes))
	}

	fmt.Printf("\tNumGC = %v\n", stats.NumGC)
}
// bToMb converts a byte count into whole mebibytes, discarding any remainder.
func bToMb(b uint64) uint64 {
	const mebibyteShift = 20 // 1 MiB == 1 << 20 bytes

	return b >> mebibyteShift
}

View File

@ -0,0 +1,131 @@
package decoder
import (
"encoding/json"
"io"
"github.com/pkg/errors"
"github.com/alcionai/corso/src/cmd/jsondebug/common"
)
// JSON delimiter tokens, in the string form produced by json.Delim.String, that
// the decoding helpers compare against.
const (
objectOpen = "{"
objectClose = "}"
arrayOpen = "["
arrayClose = "]"
)
// errEOF is returned (with a stack trace attached) when the input ends while a
// token is still expected.
var errEOF = errors.New("unexpected end of input")
// expectDelimToken reads the next token from dec and checks that it is the JSON
// delimiter expectedToken (one of "{", "}", "[" or "]").
//
// It returns errEOF with a stack trace if the input ends before a token could
// be read, a wrapped error if the decoder fails for any other reason, and a
// descriptive error if the token is not a delimiter or is a different delimiter
// than the one expected.
func expectDelimToken(dec *json.Decoder, expectedToken string) error {
	t, err := dec.Token()
	// errors.Is keeps EOF detection consistent with stringToken and still
	// matches if the EOF is ever wrapped.
	if errors.Is(err, io.EOF) {
		return errors.WithStack(errEOF)
	}

	if err != nil {
		return errors.Wrap(err, "reading JSON token")
	}

	d, ok := t.(json.Delim)
	if !ok {
		return errors.Errorf("unexpected token: (%T) %v", t, t)
	}

	if d.String() != expectedToken {
		return errors.Errorf(
			"unexpected token; wanted %s, got %s",
			expectedToken,
			d,
		)
	}

	return nil
}
// stringToken reads the next token from dec and returns it when it is a string,
// which is how object field names appear in the token stream. It returns errEOF
// with a stack trace if the input ends first, a wrapped error on any other
// decoder failure, and an error describing the token if it is not a string.
func stringToken(dec *json.Decoder) (string, error) {
	tok, err := dec.Token()

	switch {
	case errors.Is(err, io.EOF):
		return "", errors.WithStack(errEOF)
	case err != nil:
		return "", errors.Wrap(err, "reading JSON token")
	}

	if name, isString := tok.(string); isString {
		return name, nil
	}

	return "", errors.Errorf("unexpected token (%T) %v; wanted field name", tok, tok)
}
// DecodeFooArray reads one JSON object from r and decodes it into a
// common.FooArray, streaming the "entries" array one element at a time rather
// than handing the whole document to the stdlib decoder.
//
// On failure the returned FooArray holds whatever was decoded before the error.
func DecodeFooArray(r io.Reader) (common.FooArray, error) {
	var result common.FooArray

	jd := json.NewDecoder(r)

	// Opening curly brace of the top-level object.
	if err := expectDelimToken(jd, objectOpen); err != nil {
		return result, err
	}

	// Fields are walked by hand; decoding the whole object with the stdlib
	// decoder is what causes the memory issues this code works around.
	if err := parseFields(jd, &result); err != nil {
		return result, err
	}

	// Closing curly brace. Trailing input is deliberately not checked for EOF:
	// like json.Decode, this only promises to consume the next JSON value in
	// the stream.
	closeErr := expectDelimToken(jd, objectClose)

	return result, closeErr
}
// parseFields consumes the fields of the top-level object from dec until the
// object's closing brace is next in the stream, appending decoded array
// elements to res.Entries.
//
// The only accepted field name is "entries", which must match the JSON tag on
// common.FooArray; any other name is an error. Field names are read with the
// shared stringToken helper so EOF and token-type errors are reported the same
// way as in the manifest decoder.
func parseFields(dec *json.Decoder, res *common.FooArray) error {
	for dec.More() {
		l, err := stringToken(dec)
		if err != nil {
			return err
		}

		// Only have `entries` field right now. Needs to match the JSON tag for
		// the struct.
		if l != "entries" {
			return errors.Errorf("unexpected field name %s", l)
		}

		if err := decodeArray(dec, &res.Entries); err != nil {
			return err
		}
	}

	return nil
}
// decodeArray streams a JSON array from dec, decoding one element at a time
// with the regular decoder and appending each to *output. Elements decoded
// before a failure remain in *output.
func decodeArray[T any](dec *json.Decoder, output *[]T) error {
	// The array must start with '['.
	if err := expectDelimToken(dec, arrayOpen); err != nil {
		return err
	}

	// Decode elements until the closing bracket is next in the stream.
	for dec.More() {
		var elem T

		if err := dec.Decode(&elem); err != nil {
			return errors.Wrap(err, "decoding array element")
		}

		*output = append(*output, elem)
	}

	// The array must end with ']'.
	return expectDelimToken(dec, arrayClose)
}

View File

@ -0,0 +1,59 @@
package decoder
import (
"encoding/json"
"io"
"github.com/pkg/errors"
"github.com/alcionai/corso/src/cmd/jsondebug/common"
)
// DecodeManifestArray reads one JSON object from r and decodes it into a
// common.Manifest, streaming the "entries" array one element at a time rather
// than handing the whole document to the stdlib decoder.
//
// On failure the returned Manifest holds whatever was decoded before the error.
func DecodeManifestArray(r io.Reader) (common.Manifest, error) {
	var manifest common.Manifest

	jd := json.NewDecoder(r)

	// Opening curly brace of the top-level object.
	if err := expectDelimToken(jd, objectOpen); err != nil {
		return manifest, err
	}

	// Fields are walked by hand; decoding the whole object with the stdlib
	// decoder is what causes the memory issues this code works around.
	if err := parseManifestFields(jd, &manifest); err != nil {
		return manifest, err
	}

	// Closing curly brace. Trailing input is deliberately not checked for EOF:
	// like json.Decode, this only promises to consume the next JSON value in
	// the stream.
	closeErr := expectDelimToken(jd, objectClose)

	return manifest, closeErr
}
// parseManifestFields consumes the fields of the top-level manifest object from
// dec, appending decoded array elements to res.Entries. It accepts exactly one
// field named "entries": any other name, or a second "entries" field, is an
// error. That is stricter than the stdlib JSON decoder.
func parseManifestFields(dec *json.Decoder, res *common.Manifest) error {
	entriesSeen := false

	for dec.More() {
		field, err := stringToken(dec)
		if err != nil {
			return err
		}

		switch {
		case field != "entries":
			return errors.Errorf("unexpected field name %s", field)
		case entriesSeen:
			return errors.New("repeated Entries field")
		}

		entriesSeen = true

		if err := decodeArray(dec, &res.Entries); err != nil {
			return err
		}
	}

	return nil
}

View File

@ -0,0 +1,61 @@
package main
import (
"crypto/rand"
"encoding/json"
"fmt"
"os"
"time"
"github.com/alcionai/corso/src/cmd/jsondebug/common"
"github.com/google/uuid"
)
// main generates common.NumItems Foo entries, each carrying common.ItemSize
// random bytes wrapped in a serialized common.Content, and writes them as a
// single JSON object to common.FileName. Every failure is reported on stdout
// and stops the program.
func main() {
	buf := make([]byte, common.ItemSize)
	data := &common.FooArray{
		Entries: make([]*common.Foo, 0, common.NumItems),
	}

	for i := 0; i < common.NumItems; i++ {
		n, err := rand.Read(buf)
		if err != nil {
			fmt.Printf("Error reading random data: %v\n", err)
			return
		} else if n != common.ItemSize {
			fmt.Printf(
				"Short read for item data: wanted %d, got %d\n",
				common.ItemSize,
				n,
			)

			return
		}

		content := common.Content{
			ID:   uuid.NewString(),
			Data: buf,
		}

		// Marshal encodes buf into a fresh byte slice, so reusing buf on the
		// next iteration does not alter payloads that were already stored.
		payload, err := json.Marshal(content)
		if err != nil {
			fmt.Printf("Error serializing item content: %v\n", err)
			return
		}

		item := common.Foo{
			ID:      uuid.NewString(),
			Labels:  map[string]string{"foo": "bar"},
			ModTime: time.Now(),
			Content: payload,
		}
		data.Entries = append(data.Entries, &item)
	}

	f, err := os.Create(common.FileName)
	if err != nil {
		fmt.Printf("Error making output file: %v\n", err)
		return
	}

	// A failed close can mean the written data never reached the file, so it
	// is reported instead of dropped.
	defer func() {
		if err := f.Close(); err != nil {
			fmt.Printf("Error closing output file: %v\n", err)
		}
	}()

	enc := json.NewEncoder(f)
	if err := enc.Encode(data); err != nil {
		fmt.Printf("Error writing json to file: %v\n", err)
		return
	}
}

View File

@ -0,0 +1,34 @@
package main
import (
"encoding/json"
"fmt"
"os"
"github.com/alcionai/corso/src/cmd/jsondebug/common"
)
// main runs readData, which decodes the generated input file with the stdlib
// JSON decoder and prints memory statistics.
func main() {
readData()
}
// readData opens common.FileName, decodes the whole document into a
// common.FooArray with the stdlib JSON decoder, and prints memory statistics.
// Any failure is reported on stdout and ends the function early.
func readData() {
	in, openErr := os.Open(common.FileName)
	if openErr != nil {
		fmt.Printf("Error opening input file: %v\n", openErr)
		return
	}

	defer in.Close()

	var parsed common.FooArray

	if decodeErr := json.NewDecoder(in).Decode(&parsed); decodeErr != nil {
		fmt.Printf("Error decoding input: %v\n", decodeErr)
		return
	}

	common.PrintMemUsage()
}

View File

@ -0,0 +1,9 @@
package main
import "testing"
// Benchmark_readData runs one full readData call per benchmark iteration.
func Benchmark_readData(b *testing.B) {
	for remaining := b.N; remaining > 0; remaining-- {
		readData()
	}
}

View File

@ -0,0 +1,117 @@
package main
import (
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"os"
"runtime"
"runtime/pprof"
"github.com/alcionai/corso/src/cmd/jsondebug/common"
"github.com/buger/jsonparser"
)
// main reads the input file and parses it with jsonparser. On exit, including
// the early exit after a read failure, it prints memory statistics and writes
// a heap profile to "mem.prof".
func main() {
	dumpHeapProfile := func() {
		common.PrintMemUsage()

		prof, err := os.Create("mem.prof")
		if err != nil {
			fmt.Print("could not create memory profile: ", err)
			return
		}

		defer prof.Close() // close error deliberately not handled

		// Collect garbage first so the profile reflects up-to-date statistics.
		runtime.GC()

		if err := pprof.WriteHeapProfile(prof); err != nil {
			fmt.Print("could not write memory profile: ", err)
		}
	}

	defer dumpHeapProfile()

	raw, err := readFile()
	if err != nil {
		// readFile has already reported the problem.
		return
	}

	parseData(raw)
}
// readFile prints memory statistics and then loads all of common.FileName into
// memory. A read failure is printed to stdout and also returned to the caller.
func readFile() ([]byte, error) {
	common.PrintMemUsage()

	contents, err := ioutil.ReadFile(common.FileName)
	if err == nil {
		return contents, nil
	}

	fmt.Printf("Error reading file: %v\n", err)

	return nil, err
}
// parseData walks the "entries" array of the JSON document in data with
// jsonparser, converts each element with getEntry, and prints memory statistics
// before and after together with the number of entries decoded.
//
// An element that fails to decode is reported and skipped. A failure of the
// array walk itself (for example malformed JSON or a missing "entries" key) is
// reported as well instead of being silently dropped.
func parseData(data []byte) {
	common.PrintMemUsage()

	output := common.FooArray{
		Entries: []*common.Foo{},
	}

	_, err := jsonparser.ArrayEach(
		data,
		func(value []byte, _ jsonparser.ValueType, _ int, _ error) {
			e, errInner := getEntry(value)
			if errInner != nil {
				fmt.Printf("Error decoding input2: %v\n", errInner)
				return
			}

			output.Entries = append(output.Entries, e)
		},
		"entries",
	)
	if err != nil {
		fmt.Printf("Error iterating entries: %v\n", err)
	}

	common.PrintMemUsage()
	fmt.Printf("Decoded %d entries\n", len(output.Entries))
}
// getEntry decodes a single JSON object in data into a common.Foo by visiting
// its keys with jsonparser.ObjectEach.
//
// It returns an error if the object cannot be iterated, if a field value
// cannot be decoded, or if the object contains a key other than "id",
// "labels", "modified", "deleted" or "data". No entry is returned on error.
func getEntry(data []byte) (*common.Foo, error) {
	e := &common.Foo{}

	err := jsonparser.ObjectEach(data, func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) error {
		switch string(key) {
		case "id":
			// jsonparser passes string values without their surrounding
			// quotes and without unescaping them.
			e.ID = string(value)

		case "labels":
			if err := json.Unmarshal(value, &e.Labels); err != nil {
				return fmt.Errorf("unmarshalling labels: %w", err)
			}

		case "modified":
			// The quotes are already stripped from string values, so the
			// timestamp is not valid JSON for json.Unmarshal; parse it as
			// text instead. Non-string values (e.g. null) still go through
			// json.Unmarshal.
			var err error
			if dataType == jsonparser.String {
				err = e.ModTime.UnmarshalText(value)
			} else {
				err = json.Unmarshal(value, &e.ModTime)
			}

			if err != nil {
				return fmt.Errorf("unmarshalling modtime: %w", err)
			}

		case "deleted":
			if err := json.Unmarshal(value, &e.Deleted); err != nil {
				return fmt.Errorf("unmarshalling deleted: %w", err)
			}

		case "data":
			// value aliases the input buffer; copy it so the entry does not
			// keep the whole input alive.
			cpBuf := make([]byte, len(value))
			_ = copy(cpBuf, value)
			e.Content = cpBuf

		default:
			return errors.New("Unexpected Input: " + string(key))
		}

		return nil
	})
	if err != nil {
		// Previously dropped: a failing field aborted iteration silently and
		// left the remaining fields unset.
		return nil, err
	}

	return e, nil
}

View File

@ -0,0 +1,14 @@
package main
import "testing"
// Benchmark_parseData measures parseData on the contents of the input file.
// The file is read once, outside the timed region. If it cannot be read the
// benchmark fails rather than returning early and reporting a meaningless
// result.
func Benchmark_parseData(b *testing.B) {
	d, err := readFile()
	if err != nil {
		b.Fatalf("reading input file: %v", err)
	}

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		parseData(d)
	}
}

View File

@ -0,0 +1,177 @@
package main
import (
"compress/gzip"
"encoding/json"
"fmt"
"math/rand"
"os"
"time"
"github.com/pkg/errors"
"golang.org/x/exp/maps"
"github.com/alcionai/corso/src/cmd/jsondebug/common"
)
// Fixed host and user names placed in the Source of every generated
// SnapManifest.
const (
hostName = "host-name"
userName = "user-name"
)
// generateManifestEntry builds one manifest entry around a freshly generated
// SnapManifest. The labels start from a base set (type, hostname, username,
// path) and are then overlaid with the snapshot's own tags. The serialized
// SnapManifest becomes the entry content, the ID is a random 32-character
// string, and Deleted is set at random.
//
// It returns an error only if the SnapManifest cannot be serialized.
func generateManifestEntry() (*common.ManifestEntry, error) {
	snap := generateSnapManifest()

	// Labels shared by every snapshot; the snapshot's tags are added on top.
	labels := map[string]string{
		"type":     "snapshot",
		"hostname": snap.Source.Host,
		"username": snap.Source.UserName,
		"path":     snap.Source.Path,
	}
	maps.Copy(labels, snap.Tags)

	payload, err := json.Marshal(snap)
	if err != nil {
		return nil, errors.Wrap(err, "serializing inner struct")
	}

	// Assigned in the original order so the random number sequence is consumed
	// identically (ID first, then Deleted).
	entry := new(common.ManifestEntry)
	entry.ID = randStringLen(32)
	entry.ModTime = time.Now()
	entry.Deleted = rand.Uint32()&1 != 0
	entry.Labels = labels
	entry.Content = payload

	return entry, nil
}
// generateSnapManifest returns a SnapManifest filled mostly with random data.
// Host and user name are the fixed hostName/userName constants, the same random
// path is used for Source.Path and RootEntry.Name, and the same incomplete
// reason is used for the manifest and its directory summary. ID and Description
// are left at their zero values.
func generateSnapManifest() common.SnapManifest {
var incomplete string
// Roughly 1/4 incomplete.
if rand.Intn(100) < 25 {
incomplete = "checkpoint"
}
path := randString()
res := common.SnapManifest{
Source: common.SourceInfo{
Host: hostName,
UserName: userName,
Path: path,
},
StartTime: rand.Int63(),
EndTime: rand.Int63(),
Stats: common.StatsS{
TotalFileSize: rand.Int63(),
ExcludedTotalFileSize: int64(rand.Uint32()),
TotalFileCount: rand.Int31(),
CachedFiles: rand.Int31(),
NonCachedFiles: rand.Int31(),
TotalDirectoryCount: rand.Int31(),
ExcludedFileCount: rand.Int31(),
ExcludedDirCount: rand.Int31(),
IgnoredErrorCount: rand.Int31(),
ErrorCount: rand.Int31(),
},
IncompleteReason: incomplete,
RootEntry: &common.DirEntry{
Name: path,
EntryType: randStringLen(1),
Permissions: rand.Intn(512),
FileSize: rand.Int63(),
ModTime: rand.Int63(),
UserID: rand.Int31(),
GroupID: rand.Int31(),
ObjectID: randStringLen(32),
DirSummary: &common.DirectorySummary{
TotalFileSize: rand.Int63(),
TotalFileCount: rand.Int63(),
TotalSymlinkCount: rand.Int63(),
TotalDirCount: rand.Int63(),
MaxModTime: rand.Int63(),
IncompleteReason: incomplete,
FatalErrorCount: rand.Int(),
IgnoredErrorCount: rand.Int(),
},
},
// Four tags per snapshot: two with random key suffixes and two with fixed
// keys.
Tags: map[string]string{
// User stand-in.
"tag:" + randStringLen(40): "0",
"tag:backup-id": randStringLen(36),
"tag:is-canon-backup": "0",
// Service/data type stand-in.
"tag:" + randStringLen(20): "0",
},
}
return res
}
// charSet is the alphabet random strings are drawn from: ASCII letters in both
// cases plus the digits 1-9 (there is no '0').
var charSet = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789")
// randString returns a random string whose length is chosen uniformly from the
// inclusive range [10, 128].
func randString() string {
	const (
		minLen = 10
		maxLen = 128
	)

	return randStringLen(rand.Intn(maxLen-minLen+1) + minLen)
}
// randStringLen returns a string of exactly length characters, each picked at
// random from charSet.
func randStringLen(length int) string {
	out := make([]rune, length)

	for pos := 0; pos < length; pos++ {
		pick := rand.Intn(len(charSet))
		out[pos] = charSet[pick]
	}

	return string(out)
}
// main generates common.NumItems random manifest entries and writes them twice:
// as plain JSON to common.ManifestFileName and as gzip-compressed JSON to the
// same name with common.GzipSuffix appended. Every failure is reported on
// stdout and stops the program.
func main() {
	data := &common.Manifest{
		Entries: make([]*common.ManifestEntry, 0, common.NumItems),
	}

	for i := 0; i < common.NumItems; i++ {
		entry, err := generateManifestEntry()
		if err != nil {
			fmt.Printf("Error generating random entry: %v\n", err)
			return
		}

		data.Entries = append(data.Entries, entry)
	}

	f, err := os.Create(common.ManifestFileName)
	if err != nil {
		fmt.Printf("Error making regular output file: %v\n", err)
		return
	}

	defer f.Close()

	if err := json.NewEncoder(f).Encode(data); err != nil {
		fmt.Printf("Error writing json to regular file: %v\n", err)
		return
	}

	fgz, err := os.Create(common.ManifestFileName + common.GzipSuffix)
	if err != nil {
		fmt.Printf("Error making gzip output file: %v\n", err)
		return
	}

	defer fgz.Close()

	gz := gzip.NewWriter(fgz)

	if err := json.NewEncoder(gz).Encode(data); err != nil {
		fmt.Printf("Error writing json to gzip file: %v\n", err)
		return
	}

	// Close flushes the remaining compressed data and writes the gzip footer;
	// if it fails the file is truncated, so the error must not be dropped.
	if err := gz.Close(); err != nil {
		fmt.Printf("Error finishing gzip output file: %v\n", err)
	}
}

View File

@ -38,6 +38,7 @@ require (
github.com/VividCortex/ewma v1.2.0 // indirect
github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d // indirect
github.com/andybalholm/brotli v1.0.4 // indirect
github.com/buger/jsonparser v1.1.1 // indirect
github.com/dnaeon/go-vcr v1.2.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect

View File

@ -66,6 +66,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY=
github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4=
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
github.com/cenkalti/backoff/v4 v4.2.0 h1:HN5dHm3WBOgndBH6E8V0q2jIYIR3s9yglV8k/+MN3u4=
github.com/cenkalti/backoff/v4 v4.2.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=