Skip to content

Commit

Permalink
feat: utf8 sanitizer (#485)
Browse files Browse the repository at this point in the history
Co-authored-by: siddarth.msv <82795818+Sidddddarth@users.noreply.github.com>
Co-authored-by: Akash Chetty <achetty.iitr@gmail.com>
Co-authored-by: Leonidas Vrachnis <leo.al.vra@gmail.com>
Co-authored-by: devops-github-rudderstack <88187154+devops-github-rudderstack@users.noreply.github.com>
  • Loading branch information
5 people committed May 29, 2024
1 parent d07af42 commit 0ccb5aa
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 0 deletions.
23 changes: 23 additions & 0 deletions utf8/sanitize.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package utf8

import (
"unicode/utf8"
)

// replacementCharByte is the single-byte ASCII character ('?') written over every invalid byte.
// It is a constant, not a variable, so that nothing in the package can change it at runtime.
const replacementCharByte = byte('?')

// Sanitize overwrites, in place, every byte of data that is not part of a valid UTF-8 encoding
// with the ASCII character '?'.
//
// The slice length remains unchanged and no extra allocations are made. This is possible because
// the replacement is a single-byte character rather than the three-byte U+FFFD. The side effect is
// that an invalid multi-byte sequence is replaced with one '?' per offending byte,
// e.g. "\xE0\x80\xAF" -> "???".
//
// A correctly encoded U+FFFD that is already present in data is valid UTF-8 and is left untouched.
// A nil or empty slice is a no-op.
func Sanitize(data []byte) {
	for i := 0; i < len(data); {
		r, size := utf8.DecodeRune(data[i:])
		// DecodeRune reports an invalid encoding as (RuneError, 1). A genuine U+FFFD in the input
		// decodes with its full encoded width instead, so the size check tells the two apart.
		if r == utf8.RuneError && size == 1 {
			data[i] = replacementCharByte
		}
		i += size
	}
}
80 changes: 80 additions & 0 deletions utf8/sanitize_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package utf8

import (
"strings"
"testing"
"unicode/utf8"

"github.com/stretchr/testify/require"
)

// TestSanitize runs Sanitize over a table of byte slices. For each case it first confirms that the
// input is (in)valid UTF-8 as declared, then checks that the sanitized copy is valid UTF-8 and
// equals the expected bytes.
func TestSanitize(t *testing.T) {
	var (
		rb      = replacementCharByte
		hello   = []byte{72, 101, 108, 108, 111}       // Hello
		world   = []byte{228, 184, 150, 231, 149, 140} // 世界
		invalid = []byte{0xff, 0xfe, 0xfd}
	)

	// concat joins parts into a freshly allocated slice, so a composed fixture never shares a
	// backing array with the slices it was built from.
	concat := func(parts ...[]byte) []byte {
		var out []byte
		for _, p := range parts {
			out = append(out, p...)
		}
		return out
	}

	tests := []struct {
		name     string
		valid    bool
		input    []byte
		expected []byte
	}{
		{"valid", true, []byte("Hello 世界"), []byte("Hello 世界")},
		{"invalid 1", false, invalid, []byte{rb, rb, rb}},
		{"invalid 2", false, []byte{0xd4, 0x6d}, []byte{rb, 0x6d}}, // pq: invalid byte sequence for encoding "UTF8": 0xd4 0x6d
		{"mixed", false, concat(hello, invalid, world), concat(hello, []byte{rb, rb, rb}, world)},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			require.Equal(t, tt.valid, utf8.Valid(tt.input))

			// Sanitize mutates its argument, so work on a copy and keep the table entry intact.
			inputCopy := make([]byte, len(tt.input))
			copy(inputCopy, tt.input)

			Sanitize(inputCopy)
			// %q escapes invalid bytes, so the log shows them and stays valid UTF-8 itself.
			t.Logf("Sanitize(%q) = %q", tt.input, inputCopy)

			// After sanitization, the result should be valid
			require.True(t, utf8.Valid(inputCopy))
			require.Equal(t, tt.expected, inputCopy)
		})
	}
}

// TestSanitizeInOut feeds Sanitize a table of input strings and compares the sanitized bytes with
// the expected output string. Every invalid byte is expected to become one replacement character
// while valid runes, including an already-encoded U+FFFD, pass through unchanged. The result must
// also be valid UTF-8.
func TestSanitizeInOut(t *testing.T) {
	// repl returns a string made of n replacement characters.
	repl := func(n int) string {
		return strings.Repeat(string(replacementCharByte), n)
	}

	cases := []struct {
		in  string
		out string
	}{
		{"", ""},
		{"abc", "abc"},
		{"\uFDDD", "\uFDDD"},
		{"a\xffb", "a" + repl(1) + "b"},
		{"a\xffb\uFFFD", "a" + repl(1) + "b\uFFFD"},
		{"a☺\xffb☺\xC0\xAFc☺\xff", "a☺" + repl(1) + "b☺" + repl(2) + "c☺" + repl(1)},
		{"\xC0\xAF", repl(2)},
		{"\xE0\x80\xAF", repl(3)},
		{"\xed\xa0\x80", repl(3)},
		{"\xed\xbf\xbf", repl(3)},
		{"\xF0\x80\x80\xaf", repl(4)},
		{"\xF8\x80\x80\x80\xAF", repl(5)},
		{"\xFC\x80\x80\x80\x80\xAF", repl(6)},
	}

	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			// Converting the string yields a private, mutable copy for Sanitize to modify.
			buf := []byte(tc.in)

			Sanitize(buf)

			require.Equal(t, tc.out, string(buf))
			require.True(t, utf8.Valid(buf))
		})
	}
}

0 comments on commit 0ccb5aa

Please sign in to comment.