feat: utf8 sanitizer (#485)

Co-authored-by: siddarth.msv <82795818+Sidddddarth@users.noreply.github.com>
Co-authored-by: Akash Chetty <achetty.iitr@gmail.com>
Co-authored-by: Leonidas Vrachnis <leo.al.vra@gmail.com>
Co-authored-by: devops-github-rudderstack <88187154+devops-github-rudderstack@users.noreply.github.com>
rudderlabs · May 29, 2024 · 0ccb5aa · 0ccb5aa
1 parent d07af42
commit 0ccb5aa
Show file tree

Hide file tree

Showing 2 changed files with 103 additions and 0 deletions.
diff --git a/utf8/sanitize.go b/utf8/sanitize.go
@@ -0,0 +1,23 @@
+package utf8
+
+import (
+	"unicode/utf8"
+)
+
+// replacementCharByte is the single-byte ASCII character written over every invalid byte.
+const replacementCharByte byte = '?'
+
+// Sanitize overwrites, in place, each byte of data that does not belong to a valid UTF-8 sequence with
+// replacementCharByte. The slice length is unchanged and nothing is allocated because the replacement is
+// a single byte; as a consequence an invalid multi-byte sequence yields one replacement character per
+// byte, e.g. "\xE0\x80\xAF" -> "???". A correctly encoded U+FFFD is left as is.
+func Sanitize(data []byte) {
+	for i := 0; i < len(data); {
+		// Invalid bytes decode as (RuneError, 1); a real U+FFFD decodes with size 3 and must be kept.
+		r, size := utf8.DecodeRune(data[i:])
+		if r == utf8.RuneError && size == 1 {
+			data[i] = replacementCharByte
+		}
+		i += size
+	}
+}
diff --git a/utf8/sanitize_test.go b/utf8/sanitize_test.go
@@ -0,0 +1,80 @@
+package utf8
+
+import (
+	"strings"
+	"testing"
+	"unicode/utf8"
+
+	"github.com/stretchr/testify/require"
+)
+
+// TestSanitize checks each input's initial validity and the exact bytes Sanitize leaves behind.
+func TestSanitize(t *testing.T) {
+	q := replacementCharByte
+	greeting := []byte{'H', 'e', 'l', 'l', 'o'}
+	earth := []byte{0xe4, 0xb8, 0x96, 0xe7, 0x95, 0x8c} // 世界
+	garbage := []byte{0xff, 0xfe, 0xfd}
+	// concat copies its parts into a fresh slice so that no two fixtures share a backing array.
+	concat := func(parts ...[]byte) (out []byte) {
+		for _, p := range parts {
+			out = append(out, p...)
+		}
+		return out
+	}
+
+	type testCase struct {
+		name        string
+		input, want []byte
+		validBefore bool
+	}
+	for _, tc := range []testCase{
+		{name: "valid", input: []byte("Hello 世界"), want: []byte("Hello 世界"), validBefore: true},
+		{name: "invalid 1", input: garbage, want: []byte{q, q, q}},
+		// pq: invalid byte sequence for encoding "UTF8": 0xd4 0x6d
+		{name: "invalid 2", input: []byte{0xd4, 0x6d}, want: []byte{q, 0x6d}},
+		{name: "mixed", input: concat(greeting, garbage, earth), want: concat(greeting, []byte{q, q, q}, earth)},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			require.Equal(t, tc.validBefore, utf8.Valid(tc.input))
+			scratch := append([]byte(nil), tc.input...) // Sanitize mutates its argument, so work on a copy
+			Sanitize(scratch)
+			t.Logf("Sanitize(%s) = %s", tc.input, scratch)
+			require.True(t, utf8.Valid(scratch)) // after sanitization the result should be valid
+			require.Equal(t, tc.want, scratch)
+		})
+	}
+}
+
+// TestSanitizeInOut feeds Sanitize string fixtures and compares the sanitized bytes with the expected text.
+func TestSanitizeInOut(t *testing.T) {
+	// repl returns n consecutive replacement characters as a string.
+	repl := func(n int) string {
+		return strings.Repeat(string(replacementCharByte), n)
+	}
+
+	fixtures := []struct{ in, out string }{
+		{in: "", out: ""},
+		{in: "abc", out: "abc"},
+		{in: "\uFDDD", out: "\uFDDD"},
+		{in: "a\xffb", out: "a" + repl(1) + "b"},
+		{in: "a\xffb\uFFFD", out: "a" + repl(1) + "b\uFFFD"},
+		{in: "a☺\xffb☺\xC0\xAFc☺\xff", out: "a☺" + repl(1) + "b☺" + repl(2) + "c☺" + repl(1)},
+		{in: "\xC0\xAF", out: repl(2)},
+		{in: "\xE0\x80\xAF", out: repl(3)},
+		{in: "\xed\xa0\x80", out: repl(3)},
+		{in: "\xed\xbf\xbf", out: repl(3)},
+		{in: "\xF0\x80\x80\xaf", out: repl(4)},
+		{in: "\xF8\x80\x80\x80\xAF", out: repl(5)},
+		{in: "\xFC\x80\x80\x80\x80\xAF", out: repl(6)},
+	}
+
+	for i := range fixtures {
+		fx := fixtures[i]
+		t.Run(fx.in, func(t *testing.T) {
+			buf := []byte(fx.in) // the conversion copies, so the fixture string is never touched
+			Sanitize(buf)
+			require.Equal(t, fx.out, string(buf))
+			require.True(t, utf8.Valid(buf))
+		})
+	}
+}