JSON sanitizer and tests (#4925)

We've found that it's possible to get malformed JSON back from
the graph server and graph SDK. This results in errors when trying
to deserialize the objects represented by that malformed JSON.

This adds a simple JSON sanitizer and tests to make sure that even if
graph and the graph SDK happen to generate malformed JSON, we can get
back to a state where we can deserialize it again. The tests also
print info when the received content differs from the original.

This PR does not change any of the logic that corso uses during
backups or restores; it just adds sanitization code and tests.
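
For context, here's a minimal sketch (not part of this changeset) of where the sanitizer would sit relative to deserialization; `decodeItem` is a hypothetical helper used only for illustration:

```go
package example

import (
	"encoding/json"
	"fmt"

	"github.com/alcionai/corso/src/internal/common/sanitize"
)

// decodeItem is a hypothetical helper: pass raw response bytes through
// sanitize.JSONBytes first, then deserialize as usual.
func decodeItem(raw []byte) (map[string]any, error) {
	cleaned := sanitize.JSONBytes(raw)

	var out map[string]any
	if err := json.Unmarshal(cleaned, &out); err != nil {
		return nil, fmt.Errorf("deserializing sanitized item: %w", err)
	}

	return out, nil
}
```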

---

#### Does this PR need a docs update or release note?

- [ ]  Yes, it's included
- [ ] 🕐 Yes, but in a later PR
- [x]  No

#### Type of change

- [ ] 🌻 Feature
- [x] 🐛 Bugfix
- [ ] 🗺️ Documentation
- [x] 🤖 Supportability/Tests
- [ ] 💻 CI/Deployment
- [ ] 🧹 Tech Debt/Cleanup

#### Test Plan

- [x] 💪 Manual
- [x]  Unit test
- [ ] 💚 E2E
4 changed files with 546 additions and 0 deletions


@@ -0,0 +1,58 @@
package sanitize
import (
"bytes"
"fmt"
"golang.org/x/exp/slices"
)
// JSONBytes takes a []byte containing JSON as input and returns a []byte
// containing the same content, but with any character codes < 0x20 that weren't
// escaped in the original escaped properly.
func JSONBytes(input []byte) []byte {
if len(input) == 0 {
return input
}
// Avoid most reallocations by just getting a buffer of the right size to
// start with.
// TODO(ashmrtn): We may actually want to overshoot this a little so we won't
// cause a reallocation and possible doubling in size if we only need to
// escape a few characters.
buf := bytes.Buffer{}
buf.Grow(len(input))
for _, c := range input {
switch {
case c == '\n' || c == '\t' || c == '\r':
// Whitespace characters also seem to be getting transformed inside JSON
// strings already. We shouldn't further transform them because they could
// just be formatting around the JSON fields so changing them will result
// in invalid JSON.
//
// The set of whitespace characters was taken from RFC 8259 although space
// is not included in this set as it's already > 0x20.
buf.WriteByte(c)
case c < 0x20:
// Escape character ranges taken from RFC 8259. This case doesn't handle
// escape characters (0x5c) or double quotes (0x22). We're assuming escape
// characters don't require additional processing and that double quotes
// are properly escaped by whatever handed us the JSON.
//
// We need to escape the character and transform it (e.g. linefeed -> \n).
// We could use transforms like linefeed to \n, but it's actually easier,
// if a little less space efficient, to just turn them into
// multi-character sequences denoting a unicode character.
buf.WriteString(fmt.Sprintf(`\u%04X`, c))
default:
buf.WriteByte(c)
}
}
// Return a copy just so we don't hold a reference to internal bytes.Buffer
// data.
return slices.Clone(buf.Bytes())
}
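
As a concrete illustration of the behavior described in the comments above (not part of the changeset), RFC 8259 whitespace passes through untouched while other unescaped control characters become `\uXXXX` sequences:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/alcionai/corso/src/internal/common/sanitize"
)

func main() {
	// Raw newlines between tokens plus an unescaped 0x01 inside the string value.
	raw := []byte("{\n\"foo\":\"ba\x01r\"\n}")

	got := sanitize.JSONBytes(raw)

	// The newlines survive as-is; the 0x01 becomes the six-byte escape \u0001,
	// so the result is valid JSON again.
	fmt.Printf("%s\nvalid: %v\n", got, json.Valid(got))
}
```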


@@ -0,0 +1,154 @@
package sanitize_test
import (
"encoding/json"
"fmt"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"
"github.com/alcionai/corso/src/internal/common/sanitize"
"github.com/alcionai/corso/src/internal/tester"
)
type SanitizeJSONUnitSuite struct {
tester.Suite
}
func TestSanitizeJSONUnitSuite(t *testing.T) {
suite.Run(t, &SanitizeJSONUnitSuite{Suite: tester.NewUnitSuite(t)})
}
type jsonTest struct {
name string
input []byte
expect []byte
expectValid assert.BoolAssertionFunc
}
func generateCharacterTests() []jsonTest {
var (
res []jsonTest
baseTestName = "Escape0x%02X"
baseTestData = `{"foo":"ba%sr"}`
// The current implementation transforms most characters to the escape
// sequence \u0000, where the four 0's are hex digits. This follows the JSON
// spec in RFC 8259 section 7.
expect = `{"foo":"ba%s\u00%02Xr"}`
)
for i := 0; i < 0x20; i++ {
// Whitespace characters are tested with manually written tests because they
// have different handling.
if i == '\n' || i == '\t' || i == '\r' {
continue
}
res = append(
res,
jsonTest{
name: fmt.Sprintf(baseTestName, i),
input: []byte(fmt.Sprintf(baseTestData, string(rune(i)))),
expect: []byte(fmt.Sprintf(expect, "", string(rune(i)))),
expectValid: assert.True,
},
jsonTest{
name: fmt.Sprintf(baseTestName, i) + " WithEscapedEscape",
input: []byte(fmt.Sprintf(baseTestData, `\\`+string(rune(i)))),
expect: []byte(fmt.Sprintf(expect, `\\`, string(rune(i)))),
expectValid: assert.True,
})
}
return res
}
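// For illustration (not generated code): with i = 0x08 the loop above yields a
// case whose input is {"foo":"ba<0x08>r"} containing a raw backspace byte, and
// whose expected output is {"foo":"ba\u0008r"}, which json.Valid accepts.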
func (suite *SanitizeJSONUnitSuite) TestJSONBytes() {
table := []jsonTest{
{
name: "AlreadyValid NoSpecialCharacters",
input: []byte(`{"foo":"bar"}`),
expect: []byte(`{"foo":"bar"}`),
expectValid: assert.True,
},
{
name: "AlreadyValid EscapedTab",
input: []byte("{\"foo\":\"ba\\tr\\\"\"}"),
expect: []byte("{\"foo\":\"ba\\tr\\\"\"}"),
expectValid: assert.True,
},
{
name: "AlreadyValid TabOutsideString",
input: []byte("{\"foo\":\t\"bar\\\"\"}"),
expect: []byte("{\"foo\":\t\"bar\\\"\"}"),
expectValid: assert.True,
},
{
name: "AlreadyValid EscapedLinefeed",
input: []byte("{\"foo\":\"ba\\nr\\\"\"}"),
expect: []byte("{\"foo\":\"ba\\nr\\\"\"}"),
expectValid: assert.True,
},
{
name: "AlreadyValid LinefeedOutsideString",
input: []byte("{\"foo\":\n\"bar\\\"\"}"),
expect: []byte("{\"foo\":\n\"bar\\\"\"}"),
expectValid: assert.True,
},
{
name: "AlreadyValid EscapedCarriageReturn",
input: []byte("{\"foo\":\"ba\\rr\\\"\"}"),
expect: []byte("{\"foo\":\"ba\\rr\\\"\"}"),
expectValid: assert.True,
},
{
name: "AlreadyValid CarriageReturnOutsideString",
input: []byte("{\"foo\":\r\"bar\\\"\"}"),
expect: []byte("{\"foo\":\r\"bar\\\"\"}"),
expectValid: assert.True,
},
{
name: "AlreadyValid UnicodeSequence",
input: []byte(`{"foo":"ba\u0008r\""}`),
expect: []byte(`{"foo":"ba\u0008r\""}`),
expectValid: assert.True,
},
{
name: "AlreadyValid SpecialCharacters",
input: []byte(`{"foo":"ba\\r\""}`),
expect: []byte(`{"foo":"ba\\r\""}`),
expectValid: assert.True,
},
{
name: "LF characters in JSON outside quotes",
input: []byte(`{
"content":"\n>> ` + "\b\bW" + `"
}`),
expect: nil,
expectValid: assert.True,
},
{
name: "No LF characters in JSON",
input: []byte(`{"content":"\n>> ` + "\b\bW" + `"}`),
expect: nil,
expectValid: assert.True,
},
}
allTests := append(generateCharacterTests(), table...)
for _, test := range allTests {
suite.Run(test.name, func() {
t := suite.T()
got := sanitize.JSONBytes(test.input)
if test.expect != nil {
assert.Equal(t, test.expect, got)
}
test.expectValid(t, json.Valid(got))
})
}
}


@@ -0,0 +1,63 @@
package main
import (
"encoding/json"
"fmt"
"io"
"os"
"github.com/alcionai/corso/src/internal/common/sanitize"
"github.com/alcionai/corso/src/pkg/services/m365/api"
)
func main() {
// Open the JSON file
jsonFile, err := os.Open("data.json")
if err != nil {
fmt.Println(err)
return
}
defer jsonFile.Close()
// Read the file contents
byteValue, err := io.ReadAll(jsonFile)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("Original %s\n", string(byteValue))
// Declare an empty interface for holding the JSON data
var jsonData any
if !json.Valid(byteValue) {
fmt.Println("INVALID JSON")
}
_, err = api.BytesToMessageable(byteValue)
if err != nil {
fmt.Println("Error converting to messagable", err)
}
// Unmarshal the byteValue into the jsonData interface
err = json.Unmarshal(byteValue, &jsonData)
if err != nil {
fmt.Println("Error parsing JSON:", err)
}
sanitizedValue := sanitize.JSONBytes(byteValue)
fmt.Printf("\nSanitized %s\n", string(sanitizedValue))
if !json.Valid(sanitizedValue) {
fmt.Println("sanitizedValue INVALID JSON")
}
_, err = api.BytesToMessageable(sanitizedValue)
if err != nil {
fmt.Println("sanitizedValue: Error converting to messagable", err)
}
// Unmarshal the sanitizedValue into the jsonData interface
err = json.Unmarshal(sanitizedValue, &jsonData)
if err != nil {
fmt.Println("sanitizedValue: Error parsing JSON:", err)
}
}


@@ -1,6 +1,9 @@
package api
import (
"context"
"fmt"
"regexp"
"testing"
"time"
@@ -12,6 +15,7 @@ import (
"github.com/stretchr/testify/suite"
"github.com/alcionai/corso/src/internal/common/ptr"
"github.com/alcionai/corso/src/internal/common/sanitize"
exchMock "github.com/alcionai/corso/src/internal/m365/service/exchange/mock"
"github.com/alcionai/corso/src/internal/tester"
"github.com/alcionai/corso/src/internal/tester/tconfig"
@@ -523,3 +527,270 @@ func (suite *MailAPIIntgSuite) TestMail_GetContainerByName_mocked() {
})
}
}
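// sendItemWithBodyAndGetSerialized posts a message with the given subject and
// body to the given mail folder, fetches the created item back, and returns the
// JSON serialized for it.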
func sendItemWithBodyAndGetSerialized(
t *testing.T,
ctx context.Context, //revive:disable-line:context-as-argument
msgs Mail,
userID string,
mailFolderID string,
subject string,
bodyContent string,
contentType models.BodyType,
) []byte {
msg := models.NewMessage()
msg.SetSubject(ptr.To(subject))
body := models.NewItemBody()
body.SetContent(ptr.To(bodyContent))
body.SetContentType(ptr.To(contentType))
msg.SetBody(body)
item, err := msgs.PostItem(ctx, userID, mailFolderID, msg)
require.NoError(t, err, clues.ToCore(err))
fetched, _, err := msgs.GetItem(
ctx,
userID,
ptr.Val(item.GetId()),
false,
fault.New(true))
require.NoError(t, err, clues.ToCore(err))
serialized, err := msgs.Serialize(
ctx,
fetched,
userID,
ptr.Val(item.GetId()))
require.NoError(t, err, clues.ToCore(err))
return serialized
}
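// sendSerializedItemAndGetSerialized round-trips previously serialized message
// bytes: it deserializes them with BytesToMessageable, posts the result, fetches
// it back, and returns the new serialized JSON.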
func sendSerializedItemAndGetSerialized(
t *testing.T,
ctx context.Context, //revive:disable-line:context-as-argument
msgs Mail,
userID string,
mailFolderID string,
serializedInput []byte,
) []byte {
msg, err := BytesToMessageable(serializedInput)
require.NoError(t, err, clues.ToCore(err))
item, err := msgs.PostItem(ctx, userID, mailFolderID, msg)
require.NoError(t, err, clues.ToCore(err))
fetched, _, err := msgs.GetItem(
ctx,
userID,
ptr.Val(item.GetId()),
false,
fault.New(true))
require.NoError(t, err, clues.ToCore(err))
serialized, err := msgs.Serialize(
ctx,
fetched,
userID,
ptr.Val(item.GetId()))
require.NoError(t, err, clues.ToCore(err))
return serialized
}
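// TestMail_WithSpecialCharacters round-trips message bodies containing control
// characters and escape sequences through Graph, logging any transformations and
// asserting that reposting the sanitized JSON yields the same content as the
// original upload.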
func (suite *MailAPIIntgSuite) TestMail_WithSpecialCharacters() {
t := suite.T()
ctx, flush := tester.NewContext(t)
defer flush()
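// contentRegex extracts the raw value of the "content" field from the
// serialized message JSON so the test can compare what was sent with what
// came back.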
contentRegex := regexp.MustCompile(`"content": ?"(.*?)"(?:, ?"contentType"|}}|},)`)
htmlStub := "<html><head>\r\n<meta http-equiv=\"Content-Type\" " +
"content=\"text/html; charset=utf-8\"><style type=\"text/css\" " +
"style=\"display:none\">\r\n<!--\r\np\r\n\t{margin-top:0;\r\n\t" +
"margin-bottom:0}\r\n-->\r\n</style></head>" +
"<body dir=\"ltr\"><div " +
"class=\"elementToProof\" style=\"font-family:Aptos,Aptos_EmbeddedFont," +
"Aptos_MSFontService,Calibri,Helvetica,sans-serif; font-size:12pt; " +
"color:rgb(0,0,0)\">%s</div></body></html>"
userID := tconfig.M365UserID(suite.T())
folderName := testdata.DefaultRestoreConfig("EscapeCharacters").Location
msgs := suite.its.ac.Mail()
mailfolder, err := msgs.CreateContainer(ctx, userID, MsgFolderRoot, folderName)
require.NoError(t, err, clues.ToCore(err))
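// Half-open [start, end) ranges of characters that JSON strings can't contain
// unescaped per RFC 8259: control characters below 0x20, the double quote
// (0x22), and the backslash (0x5C).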
escapeCharRanges := [][]int{
{0x0, 0x20},
{0x22, 0x23},
{0x5c, 0x5d},
}
testSequences := []string{
// Character code for backspace
`\u0008`,
`\\u0008`,
"u0008",
// Character code for \
`\u005c`,
`\\u005c`,
"u005c",
// Character code for "
`\u0022`,
`\\u0022`,
"u0022",
// Character code for B
`\u0042`,
`\\u0042`,
"u0042",
// G clef character (U+1D11E) (from JSON RFC).
// TODO(ashmrtn): Uncomment this once the golang graph SDK is fixed. Right
// now it can't properly send these code points.
//`\uD834\uDD1E`,
"\\n",
"\\\n",
"abcdef\b\b",
"n" + string(rune(0)),
"n" + string(rune(0)) + "n",
}
table := []struct {
name string
contentTmpl string
contentType models.BodyType
}{
{
name: "PlainText",
contentTmpl: "%s",
contentType: models.TEXT_BODYTYPE,
},
{
name: "HTML",
contentTmpl: htmlStub,
contentType: models.HTML_BODYTYPE,
},
}
for _, test := range table {
suite.Run(test.name, func() {
t := suite.T()
for _, charRange := range escapeCharRanges {
for i := charRange[0]; i < charRange[1]; i++ {
subject := fmt.Sprintf("character %x", i)
bodyContent := fmt.Sprintf(test.contentTmpl, string(rune(i)))
serialized := sendItemWithBodyAndGetSerialized(
t,
ctx,
msgs,
userID,
ptr.Val(mailfolder.GetId()),
subject,
bodyContent,
test.contentType)
matches := contentRegex.FindAllSubmatch(serialized, -1)
switch {
case len(matches) == 0:
t.Logf("character 0x%x wasn't found", i)
case len(matches[0]) < 2:
t.Logf("character 0x%x was removed", i)
case bodyContent != string(matches[0][1]):
t.Logf("character 0x%x has been transformed to %s", i, matches[0][1])
}
sanitized := sanitize.JSONBytes(serialized)
newSerialized := sendSerializedItemAndGetSerialized(
t,
ctx,
msgs,
userID,
ptr.Val(mailfolder.GetId()),
sanitized)
newMatches := contentRegex.FindAllSubmatch(newSerialized, -1)
switch {
case len(newMatches) == 0:
t.Logf("sanitized character 0x%x wasn't found", i)
case len(newMatches[0]) < 2:
t.Logf("sanitized character 0x%x was removed", i)
case bodyContent != string(newMatches[0][1]):
t.Logf(
"sanitized character 0x%x has been transformed to %s",
i,
newMatches[0][1])
}
assert.Equal(t, matches[0][1], newMatches[0][1])
}
}
for i, sequence := range testSequences {
subject := fmt.Sprintf("sequence %d", i)
bodyContent := fmt.Sprintf(test.contentTmpl, sequence)
serialized := sendItemWithBodyAndGetSerialized(
t,
ctx,
msgs,
userID,
ptr.Val(mailfolder.GetId()),
subject,
bodyContent,
test.contentType)
matches := contentRegex.FindAllSubmatch(serialized, -1)
switch {
case len(matches) == 0:
t.Logf("sequence %d wasn't found", i)
case len(matches[0]) < 2:
t.Logf("sequence %d was removed", i)
case sequence != string(matches[0][1]):
t.Logf("sequence %d has been transformed to %s", i, matches[0][1])
}
sanitized := sanitize.JSONBytes(serialized)
newSerialized := sendSerializedItemAndGetSerialized(
t,
ctx,
msgs,
userID,
ptr.Val(mailfolder.GetId()),
sanitized)
newMatches := contentRegex.FindAllSubmatch(newSerialized, -1)
switch {
case len(newMatches) == 0:
t.Logf("sanitized sequence %d wasn't found", i)
case len(newMatches[0]) < 2:
t.Logf("sanitized sequence %d was removed", i)
case sequence != string(newMatches[0][1]):
t.Logf(
"sanitized sequence %d has been transformed to %s",
i,
newMatches[0][1])
}
assert.Equal(t, matches[0][1], newMatches[0][1])
}
})
}
}