// JSONBytes takes a []byte containing JSON as input and returns a []byte
// containing the same content but with any character codes < 0x20 that weren't
// escaped in the original escaped properly as \uXXXX sequences (RFC 8259
// section 7).
//
// JSON whitespace control characters (\n, \t, \r) are passed through
// unchanged: they may be formatting between JSON tokens, and escaping them
// there would produce invalid JSON. Space is already >= 0x20. Backslash (0x5c)
// and double quote (0x22) are assumed to be escaped correctly already by
// whatever produced the input.
func JSONBytes(input []byte) []byte {
	if len(input) == 0 {
		return input
	}

	// Avoid most reallocations by sizing the buffer to the input up front.
	// Each escaped control character adds 5 bytes, so heavily-escaped input
	// may still trigger one growth.
	// TODO(ashmrtn): We may actually want to overshoot this a little so we
	// won't cause a reallocation and possible doubling in size if we only need
	// to escape a few characters.
	buf := bytes.Buffer{}
	buf.Grow(len(input))

	for _, c := range input {
		switch {
		case c == '\n' || c == '\t' || c == '\r':
			// Whitespace characters seem to be transformed inside JSON strings
			// already; further transforming them could corrupt formatting
			// around JSON fields. Set taken from RFC 8259.
			buf.WriteByte(c)

		case c < 0x20:
			// Escape as a generic unicode sequence (e.g. linefeed -> \u000A).
			// Short forms like \n would be more compact but the generic form
			// is simpler and equally valid. Writing directly into the buffer
			// avoids the per-character string allocation of Sprintf.
			fmt.Fprintf(&buf, `\u%04X`, c)

		default:
			buf.WriteByte(c)
		}
	}

	// Return a copy just so we don't hold a reference to internal bytes.Buffer
	// data.
	return slices.Clone(buf.Bytes())
}
+ if i == '\n' || i == '\t' || i == '\r' { + continue + } + + res = append( + res, + jsonTest{ + name: fmt.Sprintf(baseTestName, i), + input: []byte(fmt.Sprintf(baseTestData, string(rune(i)))), + expect: []byte(fmt.Sprintf(expect, "", string(rune(i)))), + expectValid: assert.True, + }, + jsonTest{ + name: fmt.Sprintf(baseTestName, i) + " WithEscapedEscape", + input: []byte(fmt.Sprintf(baseTestData, `\\`+string(rune(i)))), + expect: []byte(fmt.Sprintf(expect, `\\`, string(rune(i)))), + expectValid: assert.True, + }) + } + + return res +} + +func (suite *SanitizeJSONUnitSuite) TestJSONBytes() { + table := []jsonTest{ + { + name: "AlreadyValid NoSpecialCharacters", + input: []byte(`{"foo":"bar"}`), + expect: []byte(`{"foo":"bar"}`), + expectValid: assert.True, + }, + { + name: "AlreadyValid EscapedTab", + input: []byte("{\"foo\":\"ba\\tr\\\"\"}"), + expect: []byte("{\"foo\":\"ba\\tr\\\"\"}"), + expectValid: assert.True, + }, + { + name: "AlreadyValid TabOutsideString", + input: []byte("{\"foo\":\t\"bar\\\"\"}"), + expect: []byte("{\"foo\":\t\"bar\\\"\"}"), + expectValid: assert.True, + }, + { + name: "AlreadyValid EscapedLinefeed", + input: []byte("{\"foo\":\"ba\\nr\\\"\"}"), + expect: []byte("{\"foo\":\"ba\\nr\\\"\"}"), + expectValid: assert.True, + }, + { + name: "AlreadyValid LinefeedOutsideString", + input: []byte("{\"foo\":\n\"bar\\\"\"}"), + expect: []byte("{\"foo\":\n\"bar\\\"\"}"), + expectValid: assert.True, + }, + { + name: "AlreadyValid EscapedCarriageReturn", + input: []byte("{\"foo\":\"ba\\rr\\\"\"}"), + expect: []byte("{\"foo\":\"ba\\rr\\\"\"}"), + expectValid: assert.True, + }, + { + name: "AlreadyValid CarriageReturnOutsideString", + input: []byte("{\"foo\":\r\"bar\\\"\"}"), + expect: []byte("{\"foo\":\r\"bar\\\"\"}"), + expectValid: assert.True, + }, + { + name: "AlreadyValid UnicodeSequence", + input: []byte(`{"foo":"ba\u0008r\""}`), + expect: []byte(`{"foo":"ba\u0008r\""}`), + expectValid: assert.True, + }, + { + name: "AlreadyValid 
SpecialCharacters", + input: []byte(`{"foo":"ba\\r\""}`), + expect: []byte(`{"foo":"ba\\r\""}`), + expectValid: assert.True, + }, + { + name: "LF characters in JSON outside quotes", + input: []byte(`{ + "content":"\n>> ` + "\b\bW" + `" + }`), + expect: nil, + expectValid: assert.True, + }, + { + name: "No LF characters in JSON", + input: []byte(`{"content":"\n>> ` + "\b\bW" + `"}`), + expect: nil, + expectValid: assert.True, + }, + } + + allTests := append(generateCharacterTests(), table...) + + for _, test := range allTests { + suite.Run(test.name, func() { + t := suite.T() + + got := sanitize.JSONBytes(test.input) + + if test.expect != nil { + assert.Equal(t, test.expect, got) + } + test.expectValid(t, json.Valid(got)) + }) + } +} diff --git a/src/internal/converters/eml/tools/main.go b/src/internal/converters/eml/tools/main.go new file mode 100644 index 000000000..a0f126847 --- /dev/null +++ b/src/internal/converters/eml/tools/main.go @@ -0,0 +1,63 @@ +package main + +import ( + "encoding/json" + "fmt" + "io" + "os" + + "github.com/alcionai/corso/src/internal/common/sanitize" + "github.com/alcionai/corso/src/pkg/services/m365/api" +) + +func main() { + // Open the JSON file + jsonFile, err := os.Open("data.json") + if err != nil { + fmt.Println(err) + return + } + defer jsonFile.Close() + + // Read the file contents + byteValue, _ := io.ReadAll(jsonFile) + + fmt.Printf("Original %s\n", string(byteValue)) + + // Declare an empty interface for holding the JSON data + var jsonData any + + if !json.Valid(byteValue) { + fmt.Println("INVALID JSON") + } + + _, err = api.BytesToMessageable(byteValue) + if err != nil { + fmt.Println("Error converting to messagable", err) + } + + // Unmarshal the byteValue into the jsonData interface + err = json.Unmarshal(byteValue, &jsonData) + if err != nil { + fmt.Println("Error parsing JSON:", err) + } + + sanitizedValue := sanitize.JSONBytes(byteValue) + + fmt.Printf("\nSanitized %s\n", string(sanitizedValue)) + + if 
!json.Valid(sanitizedValue) { + fmt.Println("sanitizedValue INVALID JSON") + } + + _, err = api.BytesToMessageable(sanitizedValue) + if err != nil { + fmt.Println("sanitizedValue: Error converting to messagable", err) + } + + // Unmarshal the byteValue into the jsonData interface + err = json.Unmarshal(sanitizedValue, &jsonData) + if err != nil { + fmt.Println("sanitizedValue: Error parsing JSON:", err) + } +} diff --git a/src/pkg/services/m365/api/mail_test.go b/src/pkg/services/m365/api/mail_test.go index 073991f58..d7f7dd226 100644 --- a/src/pkg/services/m365/api/mail_test.go +++ b/src/pkg/services/m365/api/mail_test.go @@ -1,6 +1,9 @@ package api import ( + "context" + "fmt" + "regexp" "testing" "time" @@ -12,6 +15,7 @@ import ( "github.com/stretchr/testify/suite" "github.com/alcionai/corso/src/internal/common/ptr" + "github.com/alcionai/corso/src/internal/common/sanitize" exchMock "github.com/alcionai/corso/src/internal/m365/service/exchange/mock" "github.com/alcionai/corso/src/internal/tester" "github.com/alcionai/corso/src/internal/tester/tconfig" @@ -523,3 +527,270 @@ func (suite *MailAPIIntgSuite) TestMail_GetContainerByName_mocked() { }) } } + +func sendItemWithBodyAndGetSerialized( + t *testing.T, + ctx context.Context, //revive:disable-line:context-as-argument + msgs Mail, + userID string, + mailFolderID string, + subject string, + bodyContent string, + contentType models.BodyType, +) []byte { + msg := models.NewMessage() + msg.SetSubject(ptr.To(subject)) + + body := models.NewItemBody() + body.SetContent(ptr.To(bodyContent)) + body.SetContentType(ptr.To(contentType)) + + msg.SetBody(body) + + item, err := msgs.PostItem(ctx, userID, mailFolderID, msg) + require.NoError(t, err, clues.ToCore(err)) + + fetched, _, err := msgs.GetItem( + ctx, + userID, + ptr.Val(item.GetId()), + false, + fault.New(true)) + require.NoError(t, err, clues.ToCore(err)) + + serialized, err := msgs.Serialize( + ctx, + fetched, + userID, + ptr.Val(item.GetId())) + 
require.NoError(t, err, clues.ToCore(err)) + + return serialized +} + +func sendSerializedItemAndGetSerialized( + t *testing.T, + ctx context.Context, //revive:disable-line:context-as-argument + msgs Mail, + userID string, + mailFolderID string, + serializedInput []byte, +) []byte { + msg, err := BytesToMessageable(serializedInput) + require.NoError(t, err, clues.ToCore(err)) + + item, err := msgs.PostItem(ctx, userID, mailFolderID, msg) + require.NoError(t, err, clues.ToCore(err)) + + fetched, _, err := msgs.GetItem( + ctx, + userID, + ptr.Val(item.GetId()), + false, + fault.New(true)) + require.NoError(t, err, clues.ToCore(err)) + + serialized, err := msgs.Serialize( + ctx, + fetched, + userID, + ptr.Val(item.GetId())) + require.NoError(t, err, clues.ToCore(err)) + + return serialized +} + +func (suite *MailAPIIntgSuite) TestMail_WithSpecialCharacters() { + t := suite.T() + + ctx, flush := tester.NewContext(t) + defer flush() + + contentRegex := regexp.MustCompile(`"content": ?"(.*?)"(?:, ?"contentType"|}}|},)`) + + htmlStub := "\r\n" + + "
%s
" + + userID := tconfig.M365UserID(suite.T()) + + folderName := testdata.DefaultRestoreConfig("EscapeCharacters").Location + msgs := suite.its.ac.Mail() + mailfolder, err := msgs.CreateContainer(ctx, userID, MsgFolderRoot, folderName) + require.NoError(t, err, clues.ToCore(err)) + + escapeCharRanges := [][]int{ + {0x0, 0x20}, + {0x22, 0x23}, + {0x5c, 0x5d}, + } + testSequences := []string{ + // Character code for backspace + `\u0008`, + `\\u0008`, + "u0008", + // Character code for \ + `\u005c`, + `\\u005c`, + "u005c", + // Character code for " + `\u0022`, + `\\u0022`, + "u0022", + // Character code for B + `\u0042`, + `\\u0042`, + "u0042", + // G clef character (U+1D11E) (from JSON RFC). + // TODO(ashmrtn): Uncomment this once the golang graph SDK is fixed. Right + // now it can't properly send these code points. + //`\uD834\uDD1E`, + "\\n", + "\\\n", + "abcdef\b\b", + "n" + string(rune(0)), + "n" + string(rune(0)) + "n", + } + + table := []struct { + name string + contentTmpl string + contentType models.BodyType + }{ + { + name: "PlainText", + contentTmpl: "%s", + contentType: models.TEXT_BODYTYPE, + }, + { + name: "HTML", + contentTmpl: htmlStub, + contentType: models.HTML_BODYTYPE, + }, + } + + for _, test := range table { + suite.Run(test.name, func() { + t := suite.T() + + for _, charRange := range escapeCharRanges { + for i := charRange[0]; i < charRange[1]; i++ { + subject := fmt.Sprintf("character %x", i) + + bodyContent := fmt.Sprintf(test.contentTmpl, string(rune(i))) + + serialized := sendItemWithBodyAndGetSerialized( + t, + ctx, + msgs, + userID, + ptr.Val(mailfolder.GetId()), + subject, + bodyContent, + test.contentType) + + matches := contentRegex.FindAllSubmatch(serialized, -1) + + switch { + case len(matches) == 0: + t.Logf("character 0x%x wasn't found", i) + + case len(matches[0]) < 2: + t.Logf("character 0x%x was removed", i) + + case bodyContent != string(matches[0][1]): + t.Logf("character 0x%x has been transformed to %s", i, matches[0][1]) + } 
+ + sanitized := sanitize.JSONBytes(serialized) + newSerialized := sendSerializedItemAndGetSerialized( + t, + ctx, + msgs, + userID, + ptr.Val(mailfolder.GetId()), + sanitized) + + newMatches := contentRegex.FindAllSubmatch(newSerialized, -1) + + switch { + case len(newMatches) == 0: + t.Logf("sanitized character 0x%x wasn't found", i) + + case len(newMatches[0]) < 2: + t.Logf("sanitized character 0x%x was removed", i) + + case bodyContent != string(newMatches[0][1]): + t.Logf( + "sanitized character 0x%x has been transformed to %s", + i, + newMatches[0][1]) + } + + assert.Equal(t, matches[0][1], newMatches[0][1]) + } + } + + for i, sequence := range testSequences { + subject := fmt.Sprintf("sequence %d", i) + + bodyContent := fmt.Sprintf(test.contentTmpl, sequence) + + serialized := sendItemWithBodyAndGetSerialized( + t, + ctx, + msgs, + userID, + ptr.Val(mailfolder.GetId()), + subject, + bodyContent, + test.contentType) + + matches := contentRegex.FindAllSubmatch(serialized, -1) + + switch { + case len(matches) == 0: + t.Logf("sequence %d wasn't found", i) + + case len(matches[0]) < 2: + t.Logf("sequence %d was removed", i) + + case sequence != string(matches[0][1]): + t.Logf("sequence %d has been transformed to %s", i, matches[0][1]) + } + + sanitized := sanitize.JSONBytes(serialized) + newSerialized := sendSerializedItemAndGetSerialized( + t, + ctx, + msgs, + userID, + ptr.Val(mailfolder.GetId()), + sanitized) + + newMatches := contentRegex.FindAllSubmatch(newSerialized, -1) + + switch { + case len(newMatches) == 0: + t.Logf("sanitized sequence %d wasn't found", i) + + case len(newMatches[0]) < 2: + t.Logf("sanitized sequence %d was removed", i) + + case sequence != string(newMatches[0][1]): + t.Logf( + "sanitized sequence %d has been transformed to %s", + i, + newMatches[0][1]) + } + + assert.Equal(t, matches[0][1], newMatches[0][1]) + } + }) + } +}