JSON sanitizer and tests (#4925)
We've found that it's possible to get malformed JSON back from the graph server and graph SDK. This results in errors while trying to deserialize objects that are malformed JSON This adds a simple JSON sanitizer and tests to make sure that even if graph and the graph SDK happen to generate malformed JSON we can get back to a state where we can deserialize it again. The tests also print info when the received content differs from the original This PR does not change any of the logic that corso uses during backups or restores, it just adds sanitization code and tests --- #### Does this PR need a docs update or release note? - [ ] ✅ Yes, it's included - [ ] 🕐 Yes, but in a later PR - [x] ⛔ No #### Type of change - [ ] 🌻 Feature - [x] 🐛 Bugfix - [ ] 🗺️ Documentation - [x] 🤖 Supportability/Tests - [ ] 💻 CI/Deployment - [ ] 🧹 Tech Debt/Cleanup #### Test Plan - [x] 💪 Manual - [x] ⚡ Unit test - [ ] 💚 E2E
This commit is contained in:
parent
f5f65f0cfe
commit
87e41b20e0
58
src/internal/common/sanitize/json.go
Normal file
58
src/internal/common/sanitize/json.go
Normal file
@ -0,0 +1,58 @@
|
||||
package sanitize
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
// JSONString takes a []byte containing JSON as input and returns a []byte
|
||||
// containing the same content but with any character codes < 0x20 that weren't
|
||||
// escaped in the original escaped properly.
|
||||
func JSONBytes(input []byte) []byte {
|
||||
if len(input) == 0 {
|
||||
return input
|
||||
}
|
||||
|
||||
// Avoid most reallocations by just getting a buffer of the right size to
|
||||
// start with.
|
||||
// TODO(ashmrtn): We may actually want to overshoot this a little so we won't
|
||||
// cause a reallocation and possible doubling in size if we only need to
|
||||
// escape a few characters.
|
||||
buf := bytes.Buffer{}
|
||||
buf.Grow(len(input))
|
||||
|
||||
for _, c := range input {
|
||||
switch {
|
||||
case c == '\n' || c == '\t' || c == '\r':
|
||||
// Whitespace characters also seem to be getting transformed inside JSON
|
||||
// strings already. We shouldn't further transform them because they could
|
||||
// just be formatting around the JSON fields so changing them will result
|
||||
// in invalid JSON.
|
||||
//
|
||||
// The set of whitespace characters was taken from RFC 8259 although space
|
||||
// is not included in this set as it's already > 0x20.
|
||||
buf.WriteByte(c)
|
||||
|
||||
case c < 0x20:
|
||||
// Escape character ranges taken from RFC 8259. This case doesn't handle
|
||||
// escape characters (0x5c) or double quotes (0x22). We're assuming escape
|
||||
// characters don't require additional processing and that double quotes
|
||||
// are properly escaped by whatever handed us the JSON.
|
||||
//
|
||||
// We need to escape the character and transform it (e.x. linefeed -> \n).
|
||||
// We could use transforms like linefeed to \n, but it's actually easier,
|
||||
// if a little less space efficient, to just turn them into
|
||||
// multi-character sequences denoting a unicode character.
|
||||
buf.WriteString(fmt.Sprintf(`\u%04X`, c))
|
||||
|
||||
default:
|
||||
buf.WriteByte(c)
|
||||
}
|
||||
}
|
||||
|
||||
// Return a copy just so we don't hold a reference to internal bytes.Buffer
|
||||
// data.
|
||||
return slices.Clone(buf.Bytes())
|
||||
}
|
||||
154
src/internal/common/sanitize/json_test.go
Normal file
154
src/internal/common/sanitize/json_test.go
Normal file
@ -0,0 +1,154 @@
|
||||
package sanitize_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/suite"
|
||||
|
||||
"github.com/alcionai/corso/src/internal/common/sanitize"
|
||||
"github.com/alcionai/corso/src/internal/tester"
|
||||
)
|
||||
|
||||
type SanitizeJSONUnitSuite struct {
|
||||
tester.Suite
|
||||
}
|
||||
|
||||
func TestSanitizeJSONUnitSuite(t *testing.T) {
|
||||
suite.Run(t, &SanitizeJSONUnitSuite{Suite: tester.NewUnitSuite(t)})
|
||||
}
|
||||
|
||||
type jsonTest struct {
|
||||
name string
|
||||
input []byte
|
||||
expect []byte
|
||||
expectValid assert.BoolAssertionFunc
|
||||
}
|
||||
|
||||
func generateCharacterTests() []jsonTest {
|
||||
var (
|
||||
res []jsonTest
|
||||
|
||||
baseTestName = "Escape0x%02X"
|
||||
baseTestData = `{"foo":"ba%sr"}`
|
||||
// The current implementation tranforms most characters to the encoding
|
||||
// \u0000 where the 4 0's are hex digits. This is according to the JSON spec
|
||||
// in RFC 8259 section 7.
|
||||
expect = `{"foo":"ba%s\u00%02Xr"}`
|
||||
)
|
||||
|
||||
for i := 0; i < 0x20; i++ {
|
||||
// Whitespace characters are tested with manually written tests because they
|
||||
// have different handling.
|
||||
if i == '\n' || i == '\t' || i == '\r' {
|
||||
continue
|
||||
}
|
||||
|
||||
res = append(
|
||||
res,
|
||||
jsonTest{
|
||||
name: fmt.Sprintf(baseTestName, i),
|
||||
input: []byte(fmt.Sprintf(baseTestData, string(rune(i)))),
|
||||
expect: []byte(fmt.Sprintf(expect, "", string(rune(i)))),
|
||||
expectValid: assert.True,
|
||||
},
|
||||
jsonTest{
|
||||
name: fmt.Sprintf(baseTestName, i) + " WithEscapedEscape",
|
||||
input: []byte(fmt.Sprintf(baseTestData, `\\`+string(rune(i)))),
|
||||
expect: []byte(fmt.Sprintf(expect, `\\`, string(rune(i)))),
|
||||
expectValid: assert.True,
|
||||
})
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func (suite *SanitizeJSONUnitSuite) TestJSONBytes() {
|
||||
table := []jsonTest{
|
||||
{
|
||||
name: "AlreadyValid NoSpecialCharacters",
|
||||
input: []byte(`{"foo":"bar"}`),
|
||||
expect: []byte(`{"foo":"bar"}`),
|
||||
expectValid: assert.True,
|
||||
},
|
||||
{
|
||||
name: "AlreadyValid EscapedTab",
|
||||
input: []byte("{\"foo\":\"ba\\tr\\\"\"}"),
|
||||
expect: []byte("{\"foo\":\"ba\\tr\\\"\"}"),
|
||||
expectValid: assert.True,
|
||||
},
|
||||
{
|
||||
name: "AlreadyValid TabOutsideString",
|
||||
input: []byte("{\"foo\":\t\"bar\\\"\"}"),
|
||||
expect: []byte("{\"foo\":\t\"bar\\\"\"}"),
|
||||
expectValid: assert.True,
|
||||
},
|
||||
{
|
||||
name: "AlreadyValid EscapedLinefeed",
|
||||
input: []byte("{\"foo\":\"ba\\nr\\\"\"}"),
|
||||
expect: []byte("{\"foo\":\"ba\\nr\\\"\"}"),
|
||||
expectValid: assert.True,
|
||||
},
|
||||
{
|
||||
name: "AlreadyValid LinefeedOutsideString",
|
||||
input: []byte("{\"foo\":\n\"bar\\\"\"}"),
|
||||
expect: []byte("{\"foo\":\n\"bar\\\"\"}"),
|
||||
expectValid: assert.True,
|
||||
},
|
||||
{
|
||||
name: "AlreadyValid EscapedCarriageReturn",
|
||||
input: []byte("{\"foo\":\"ba\\rr\\\"\"}"),
|
||||
expect: []byte("{\"foo\":\"ba\\rr\\\"\"}"),
|
||||
expectValid: assert.True,
|
||||
},
|
||||
{
|
||||
name: "AlreadyValid CarriageReturnOutsideString",
|
||||
input: []byte("{\"foo\":\r\"bar\\\"\"}"),
|
||||
expect: []byte("{\"foo\":\r\"bar\\\"\"}"),
|
||||
expectValid: assert.True,
|
||||
},
|
||||
{
|
||||
name: "AlreadyValid UnicodeSequence",
|
||||
input: []byte(`{"foo":"ba\u0008r\""}`),
|
||||
expect: []byte(`{"foo":"ba\u0008r\""}`),
|
||||
expectValid: assert.True,
|
||||
},
|
||||
{
|
||||
name: "AlreadyValid SpecialCharacters",
|
||||
input: []byte(`{"foo":"ba\\r\""}`),
|
||||
expect: []byte(`{"foo":"ba\\r\""}`),
|
||||
expectValid: assert.True,
|
||||
},
|
||||
{
|
||||
name: "LF characters in JSON outside quotes",
|
||||
input: []byte(`{
|
||||
"content":"\n>> ` + "\b\bW" + `"
|
||||
}`),
|
||||
expect: nil,
|
||||
expectValid: assert.True,
|
||||
},
|
||||
{
|
||||
name: "No LF characters in JSON",
|
||||
input: []byte(`{"content":"\n>> ` + "\b\bW" + `"}`),
|
||||
expect: nil,
|
||||
expectValid: assert.True,
|
||||
},
|
||||
}
|
||||
|
||||
allTests := append(generateCharacterTests(), table...)
|
||||
|
||||
for _, test := range allTests {
|
||||
suite.Run(test.name, func() {
|
||||
t := suite.T()
|
||||
|
||||
got := sanitize.JSONBytes(test.input)
|
||||
|
||||
if test.expect != nil {
|
||||
assert.Equal(t, test.expect, got)
|
||||
}
|
||||
test.expectValid(t, json.Valid(got))
|
||||
})
|
||||
}
|
||||
}
|
||||
63
src/internal/converters/eml/tools/main.go
Normal file
63
src/internal/converters/eml/tools/main.go
Normal file
@ -0,0 +1,63 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/alcionai/corso/src/internal/common/sanitize"
|
||||
"github.com/alcionai/corso/src/pkg/services/m365/api"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Open the JSON file
|
||||
jsonFile, err := os.Open("data.json")
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
defer jsonFile.Close()
|
||||
|
||||
// Read the file contents
|
||||
byteValue, _ := io.ReadAll(jsonFile)
|
||||
|
||||
fmt.Printf("Original %s\n", string(byteValue))
|
||||
|
||||
// Declare an empty interface for holding the JSON data
|
||||
var jsonData any
|
||||
|
||||
if !json.Valid(byteValue) {
|
||||
fmt.Println("INVALID JSON")
|
||||
}
|
||||
|
||||
_, err = api.BytesToMessageable(byteValue)
|
||||
if err != nil {
|
||||
fmt.Println("Error converting to messagable", err)
|
||||
}
|
||||
|
||||
// Unmarshal the byteValue into the jsonData interface
|
||||
err = json.Unmarshal(byteValue, &jsonData)
|
||||
if err != nil {
|
||||
fmt.Println("Error parsing JSON:", err)
|
||||
}
|
||||
|
||||
sanitizedValue := sanitize.JSONBytes(byteValue)
|
||||
|
||||
fmt.Printf("\nSanitized %s\n", string(sanitizedValue))
|
||||
|
||||
if !json.Valid(sanitizedValue) {
|
||||
fmt.Println("sanitizedValue INVALID JSON")
|
||||
}
|
||||
|
||||
_, err = api.BytesToMessageable(sanitizedValue)
|
||||
if err != nil {
|
||||
fmt.Println("sanitizedValue: Error converting to messagable", err)
|
||||
}
|
||||
|
||||
// Unmarshal the byteValue into the jsonData interface
|
||||
err = json.Unmarshal(sanitizedValue, &jsonData)
|
||||
if err != nil {
|
||||
fmt.Println("sanitizedValue: Error parsing JSON:", err)
|
||||
}
|
||||
}
|
||||
@ -1,6 +1,9 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@ -12,6 +15,7 @@ import (
|
||||
"github.com/stretchr/testify/suite"
|
||||
|
||||
"github.com/alcionai/corso/src/internal/common/ptr"
|
||||
"github.com/alcionai/corso/src/internal/common/sanitize"
|
||||
exchMock "github.com/alcionai/corso/src/internal/m365/service/exchange/mock"
|
||||
"github.com/alcionai/corso/src/internal/tester"
|
||||
"github.com/alcionai/corso/src/internal/tester/tconfig"
|
||||
@ -523,3 +527,270 @@ func (suite *MailAPIIntgSuite) TestMail_GetContainerByName_mocked() {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func sendItemWithBodyAndGetSerialized(
|
||||
t *testing.T,
|
||||
ctx context.Context, //revive:disable-line:context-as-argument
|
||||
msgs Mail,
|
||||
userID string,
|
||||
mailFolderID string,
|
||||
subject string,
|
||||
bodyContent string,
|
||||
contentType models.BodyType,
|
||||
) []byte {
|
||||
msg := models.NewMessage()
|
||||
msg.SetSubject(ptr.To(subject))
|
||||
|
||||
body := models.NewItemBody()
|
||||
body.SetContent(ptr.To(bodyContent))
|
||||
body.SetContentType(ptr.To(contentType))
|
||||
|
||||
msg.SetBody(body)
|
||||
|
||||
item, err := msgs.PostItem(ctx, userID, mailFolderID, msg)
|
||||
require.NoError(t, err, clues.ToCore(err))
|
||||
|
||||
fetched, _, err := msgs.GetItem(
|
||||
ctx,
|
||||
userID,
|
||||
ptr.Val(item.GetId()),
|
||||
false,
|
||||
fault.New(true))
|
||||
require.NoError(t, err, clues.ToCore(err))
|
||||
|
||||
serialized, err := msgs.Serialize(
|
||||
ctx,
|
||||
fetched,
|
||||
userID,
|
||||
ptr.Val(item.GetId()))
|
||||
require.NoError(t, err, clues.ToCore(err))
|
||||
|
||||
return serialized
|
||||
}
|
||||
|
||||
func sendSerializedItemAndGetSerialized(
|
||||
t *testing.T,
|
||||
ctx context.Context, //revive:disable-line:context-as-argument
|
||||
msgs Mail,
|
||||
userID string,
|
||||
mailFolderID string,
|
||||
serializedInput []byte,
|
||||
) []byte {
|
||||
msg, err := BytesToMessageable(serializedInput)
|
||||
require.NoError(t, err, clues.ToCore(err))
|
||||
|
||||
item, err := msgs.PostItem(ctx, userID, mailFolderID, msg)
|
||||
require.NoError(t, err, clues.ToCore(err))
|
||||
|
||||
fetched, _, err := msgs.GetItem(
|
||||
ctx,
|
||||
userID,
|
||||
ptr.Val(item.GetId()),
|
||||
false,
|
||||
fault.New(true))
|
||||
require.NoError(t, err, clues.ToCore(err))
|
||||
|
||||
serialized, err := msgs.Serialize(
|
||||
ctx,
|
||||
fetched,
|
||||
userID,
|
||||
ptr.Val(item.GetId()))
|
||||
require.NoError(t, err, clues.ToCore(err))
|
||||
|
||||
return serialized
|
||||
}
|
||||
|
||||
func (suite *MailAPIIntgSuite) TestMail_WithSpecialCharacters() {
|
||||
t := suite.T()
|
||||
|
||||
ctx, flush := tester.NewContext(t)
|
||||
defer flush()
|
||||
|
||||
contentRegex := regexp.MustCompile(`"content": ?"(.*?)"(?:, ?"contentType"|}}|},)`)
|
||||
|
||||
htmlStub := "<html><head>\r\n<meta http-equiv=\"Content-Type\" " +
|
||||
"content=\"text/html; charset=utf-8\"><style type=\"text/css\" " +
|
||||
"style=\"display:none\">\r\n<!--\r\np\r\n\t{margin-top:0;\r\n\t" +
|
||||
"margin-bottom:0}\r\n-->\r\n</style></head>" +
|
||||
"<body dir=\"ltr\"><div " +
|
||||
"class=\"elementToProof\" style=\"font-family:Aptos,Aptos_EmbeddedFont," +
|
||||
"Aptos_MSFontService,Calibri,Helvetica,sans-serif; font-size:12pt; " +
|
||||
"color:rgb(0,0,0)\">%s</div></body></html>"
|
||||
|
||||
userID := tconfig.M365UserID(suite.T())
|
||||
|
||||
folderName := testdata.DefaultRestoreConfig("EscapeCharacters").Location
|
||||
msgs := suite.its.ac.Mail()
|
||||
mailfolder, err := msgs.CreateContainer(ctx, userID, MsgFolderRoot, folderName)
|
||||
require.NoError(t, err, clues.ToCore(err))
|
||||
|
||||
escapeCharRanges := [][]int{
|
||||
{0x0, 0x20},
|
||||
{0x22, 0x23},
|
||||
{0x5c, 0x5d},
|
||||
}
|
||||
testSequences := []string{
|
||||
// Character code for backspace
|
||||
`\u0008`,
|
||||
`\\u0008`,
|
||||
"u0008",
|
||||
// Character code for \
|
||||
`\u005c`,
|
||||
`\\u005c`,
|
||||
"u005c",
|
||||
// Character code for "
|
||||
`\u0022`,
|
||||
`\\u0022`,
|
||||
"u0022",
|
||||
// Character code for B
|
||||
`\u0042`,
|
||||
`\\u0042`,
|
||||
"u0042",
|
||||
// G clef character (U+1D11E) (from JSON RFC).
|
||||
// TODO(ashmrtn): Uncomment this once the golang graph SDK is fixed. Right
|
||||
// now it can't properly send these code points.
|
||||
//`\uD834\uDD1E`,
|
||||
"\\n",
|
||||
"\\\n",
|
||||
"abcdef\b\b",
|
||||
"n" + string(rune(0)),
|
||||
"n" + string(rune(0)) + "n",
|
||||
}
|
||||
|
||||
table := []struct {
|
||||
name string
|
||||
contentTmpl string
|
||||
contentType models.BodyType
|
||||
}{
|
||||
{
|
||||
name: "PlainText",
|
||||
contentTmpl: "%s",
|
||||
contentType: models.TEXT_BODYTYPE,
|
||||
},
|
||||
{
|
||||
name: "HTML",
|
||||
contentTmpl: htmlStub,
|
||||
contentType: models.HTML_BODYTYPE,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range table {
|
||||
suite.Run(test.name, func() {
|
||||
t := suite.T()
|
||||
|
||||
for _, charRange := range escapeCharRanges {
|
||||
for i := charRange[0]; i < charRange[1]; i++ {
|
||||
subject := fmt.Sprintf("character %x", i)
|
||||
|
||||
bodyContent := fmt.Sprintf(test.contentTmpl, string(rune(i)))
|
||||
|
||||
serialized := sendItemWithBodyAndGetSerialized(
|
||||
t,
|
||||
ctx,
|
||||
msgs,
|
||||
userID,
|
||||
ptr.Val(mailfolder.GetId()),
|
||||
subject,
|
||||
bodyContent,
|
||||
test.contentType)
|
||||
|
||||
matches := contentRegex.FindAllSubmatch(serialized, -1)
|
||||
|
||||
switch {
|
||||
case len(matches) == 0:
|
||||
t.Logf("character 0x%x wasn't found", i)
|
||||
|
||||
case len(matches[0]) < 2:
|
||||
t.Logf("character 0x%x was removed", i)
|
||||
|
||||
case bodyContent != string(matches[0][1]):
|
||||
t.Logf("character 0x%x has been transformed to %s", i, matches[0][1])
|
||||
}
|
||||
|
||||
sanitized := sanitize.JSONBytes(serialized)
|
||||
newSerialized := sendSerializedItemAndGetSerialized(
|
||||
t,
|
||||
ctx,
|
||||
msgs,
|
||||
userID,
|
||||
ptr.Val(mailfolder.GetId()),
|
||||
sanitized)
|
||||
|
||||
newMatches := contentRegex.FindAllSubmatch(newSerialized, -1)
|
||||
|
||||
switch {
|
||||
case len(newMatches) == 0:
|
||||
t.Logf("sanitized character 0x%x wasn't found", i)
|
||||
|
||||
case len(newMatches[0]) < 2:
|
||||
t.Logf("sanitized character 0x%x was removed", i)
|
||||
|
||||
case bodyContent != string(newMatches[0][1]):
|
||||
t.Logf(
|
||||
"sanitized character 0x%x has been transformed to %s",
|
||||
i,
|
||||
newMatches[0][1])
|
||||
}
|
||||
|
||||
assert.Equal(t, matches[0][1], newMatches[0][1])
|
||||
}
|
||||
}
|
||||
|
||||
for i, sequence := range testSequences {
|
||||
subject := fmt.Sprintf("sequence %d", i)
|
||||
|
||||
bodyContent := fmt.Sprintf(test.contentTmpl, sequence)
|
||||
|
||||
serialized := sendItemWithBodyAndGetSerialized(
|
||||
t,
|
||||
ctx,
|
||||
msgs,
|
||||
userID,
|
||||
ptr.Val(mailfolder.GetId()),
|
||||
subject,
|
||||
bodyContent,
|
||||
test.contentType)
|
||||
|
||||
matches := contentRegex.FindAllSubmatch(serialized, -1)
|
||||
|
||||
switch {
|
||||
case len(matches) == 0:
|
||||
t.Logf("sequence %d wasn't found", i)
|
||||
|
||||
case len(matches[0]) < 2:
|
||||
t.Logf("sequence %d was removed", i)
|
||||
|
||||
case sequence != string(matches[0][1]):
|
||||
t.Logf("sequence %d has been transformed to %s", i, matches[0][1])
|
||||
}
|
||||
|
||||
sanitized := sanitize.JSONBytes(serialized)
|
||||
newSerialized := sendSerializedItemAndGetSerialized(
|
||||
t,
|
||||
ctx,
|
||||
msgs,
|
||||
userID,
|
||||
ptr.Val(mailfolder.GetId()),
|
||||
sanitized)
|
||||
|
||||
newMatches := contentRegex.FindAllSubmatch(newSerialized, -1)
|
||||
|
||||
switch {
|
||||
case len(newMatches) == 0:
|
||||
t.Logf("sanitized sequence %d wasn't found", i)
|
||||
|
||||
case len(newMatches[0]) < 2:
|
||||
t.Logf("sanitized sequence %d was removed", i)
|
||||
|
||||
case sequence != string(newMatches[0][1]):
|
||||
t.Logf(
|
||||
"sanitized sequence %d has been transformed to %s",
|
||||
i,
|
||||
newMatches[0][1])
|
||||
}
|
||||
|
||||
assert.Equal(t, matches[0][1], newMatches[0][1])
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user