-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: siddarth.msv <82795818+Sidddddarth@users.noreply.github.com> Co-authored-by: Akash Chetty <achetty.iitr@gmail.com> Co-authored-by: Leonidas Vrachnis <leo.al.vra@gmail.com> Co-authored-by: devops-github-rudderstack <88187154+devops-github-rudderstack@users.noreply.github.com>
- Loading branch information
1 parent
d07af42
commit 0ccb5aa
Showing
2 changed files
with
103 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package utf8 | ||
|
||
import ( | ||
"unicode/utf8" | ||
) | ||
|
||
// replacementCharByte is the byte written over every invalid byte: ASCII '?' (63).
// It is a constant, and deliberately a single byte wide, so that Sanitize can
// overwrite invalid input in place without ever changing the slice length.
const replacementCharByte = byte('?')

// Sanitize detects invalid UTF-8 byte sequences in data and overwrites them with
// the replacement character '?'.
//
// The slice is modified in place: its length never changes and no allocations are
// made. This is possible because the replacement character is a single byte. The
// trade-off is that an invalid multi-byte sequence is replaced by one '?' per
// offending byte rather than by a single character, e.g. "\xE0\x80\xAF" -> "???".
//
// Valid runes are left untouched, including a correctly encoded U+FFFD.
// A nil or empty slice is a no-op.
func Sanitize(data []byte) {
	for i := 0; i < len(data); {
		r, size := utf8.DecodeRune(data[i:])
		// DecodeRune reports an invalid encoding as (RuneError, 1). A valid,
		// explicitly encoded U+FFFD comes back as (RuneError, 3) and must be kept,
		// hence the additional check on size.
		if r == utf8.RuneError && size == 1 {
			data[i] = replacementCharByte
		}
		// size is at least 1 here because data[i:] is never empty, so the loop
		// always makes progress.
		i += size
	}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
package utf8 | ||
|
||
import ( | ||
"strings" | ||
"testing" | ||
"unicode/utf8" | ||
|
||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestSanitize(t *testing.T) { | ||
var ( | ||
rb = replacementCharByte | ||
hello = []byte{72, 101, 108, 108, 111} // Hello | ||
world = []byte{228, 184, 150, 231, 149, 140} // 世界 | ||
invalid = []byte{0xff, 0xfe, 0xfd} | ||
) | ||
|
||
tests := []struct { | ||
name string | ||
valid bool | ||
input []byte | ||
expected []byte | ||
}{ | ||
{"valid", true, []byte("Hello 世界"), []byte("Hello 世界")}, | ||
{"invalid 1", false, invalid, []byte{rb, rb, rb}}, | ||
{"invalid 2", false, []byte{0xd4, 0x6d}, []byte{rb, 0x6d}}, // pq: invalid byte sequence for encoding "UTF8": 0xd4 0x6d | ||
{"mixed", false, append(hello, append(invalid, world...)...), append(hello, append([]byte{rb, rb, rb}, world...)...)}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
require.Equal(t, tt.valid, utf8.Valid(tt.input)) | ||
|
||
inputCopy := make([]byte, len(tt.input)) // Copy to avoid modifying the original input | ||
copy(inputCopy, tt.input) | ||
|
||
Sanitize(inputCopy) | ||
t.Logf("Sanitize(%s) = %s", tt.input, inputCopy) | ||
|
||
// After sanitization, the result should be valid | ||
require.True(t, utf8.Valid(inputCopy)) | ||
require.Equal(t, tt.expected, inputCopy) | ||
}) | ||
} | ||
} | ||
|
||
func TestSanitizeInOut(t *testing.T) { | ||
ch := func(n int) string { return strings.Repeat(string(replacementCharByte), n) } | ||
|
||
toValidUTF8Tests := []struct { | ||
in string | ||
out string | ||
}{ | ||
{"", ""}, | ||
{"abc", "abc"}, | ||
{"\uFDDD", "\uFDDD"}, | ||
{"a\xffb", "a" + ch(1) + "b"}, | ||
{"a\xffb\uFFFD", "a" + ch(1) + "b\uFFFD"}, | ||
{"a☺\xffb☺\xC0\xAFc☺\xff", "a☺" + ch(1) + "b☺" + ch(2) + "c☺" + ch(1)}, | ||
{"\xC0\xAF", ch(2)}, | ||
{"\xE0\x80\xAF", ch(3)}, | ||
{"\xed\xa0\x80", ch(3)}, | ||
{"\xed\xbf\xbf", ch(3)}, | ||
{"\xF0\x80\x80\xaf", ch(4)}, | ||
{"\xF8\x80\x80\x80\xAF", ch(5)}, | ||
{"\xFC\x80\x80\x80\x80\xAF", ch(6)}, | ||
} | ||
|
||
for _, tt := range toValidUTF8Tests { | ||
t.Run(tt.in, func(t *testing.T) { | ||
inputCopy := make([]byte, len(tt.in)) // Copy to avoid modifying the original input | ||
copy(inputCopy, tt.in) | ||
|
||
Sanitize(inputCopy) | ||
require.Equal(t, tt.out, string(inputCopy)) | ||
require.True(t, utf8.Valid(inputCopy)) | ||
}) | ||
} | ||
} |