feat(gnovm): add unicode/utf16 in stdlibs

Signed-off-by: Norman Meier <norman@berty.tech>
gnolang · Aug 24, 2023 · ed06097 · ed06097
1 parent a612e57
commit ed06097
Show file tree

Hide file tree

Showing 3 changed files with 327 additions and 0 deletions.
diff --git a/gnovm/stdlibs/unicode/utf16/export_test.gno b/gnovm/stdlibs/unicode/utf16/export_test.gno
@@ -0,0 +1,11 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf16
+
+// Exported aliases of the package-private constants so the external test package can validate them against package unicode.
+const (
+	MaxRune         = maxRune         // alias of the unexported maxRune
+	ReplacementChar = replacementChar // alias of the unexported replacementChar
+)
diff --git a/gnovm/stdlibs/unicode/utf16/utf16.gno b/gnovm/stdlibs/unicode/utf16/utf16.gno
@@ -0,0 +1,108 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package utf16 implements encoding and decoding of UTF-16 sequences.
+package utf16
+
+// The conditions replacementChar==unicode.ReplacementChar and
+// maxRune==unicode.MaxRune are verified in the tests.
+// Defining them locally avoids this package depending on package unicode.
+
+const (
+	replacementChar = '\uFFFD'     // Unicode replacement character, substituted for invalid input.
+	maxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
+)
+
+const (
+	// [0xd800, 0xdc00) encodes the high 10 bits of a pair.
+	// [0xdc00, 0xe000) encodes the low 10 bits of a pair.
+	// The decoded value is those 20 bits plus 0x10000.
+	surr1 = 0xd800 // first value of the high (leading) half
+	surr2 = 0xdc00 // first value of the low (trailing) half; one past the high half
+	surr3 = 0xe000 // one past the last surrogate value
+
+	surrSelf = 0x10000 // smallest code point that needs a surrogate pair
+)
+
+// IsSurrogate reports whether r lies in the surrogate range
+// [0xd800, 0xe000), i.e. whether it may be one half of a surrogate pair.
+func IsSurrogate(r rune) bool {
+	if r < surr1 {
+		return false
+	}
+	return r < surr3
+}
+
+// DecodeRune combines the surrogate pair (r1, r2) into the code point it
+// encodes. r1 must lie in [0xd800, 0xdc00) and r2 in [0xdc00, 0xe000);
+// for any other input DecodeRune returns the Unicode replacement code
+// point U+FFFD.
+func DecodeRune(r1, r2 rune) rune {
+	highOK := r1 >= surr1 && r1 < surr2
+	lowOK := r2 >= surr2 && r2 < surr3
+	if !highOK || !lowOK {
+		return replacementChar
+	}
+	// Ten payload bits come from each half; the result is offset by 0x10000.
+	hi := (r1 - surr1) << 10
+	lo := r2 - surr2
+	return (hi | lo) + surrSelf
+}
+
+// EncodeRune splits r into its UTF-16 surrogate pair r1, r2.
+// Only code points in [0x10000, U+10FFFF] have such a pair; for every
+// other value (including invalid code points) EncodeRune returns
+// U+FFFD, U+FFFD.
+func EncodeRune(r rune) (r1, r2 rune) {
+	if r > maxRune || r < surrSelf {
+		return replacementChar, replacementChar
+	}
+	offset := r - surrSelf
+	hi := (offset >> 10) & 0x3ff // upper ten bits
+	lo := offset & 0x3ff         // lower ten bits
+	return surr1 + hi, surr2 + lo
+}
+
+// Encode returns the UTF-16 encoding of the Unicode code point sequence s.
+// Surrogate code points and values outside [0, U+10FFFF] are encoded as
+// U+FFFD.
+func Encode(s []rune) []uint16 {
+	// Upper bound on the output length: one unit per rune plus one extra
+	// unit for every rune at or above 0x10000.
+	size := len(s)
+	for i := range s {
+		if s[i] >= surrSelf {
+			size++
+		}
+	}
+
+	out := make([]uint16, size)
+	pos := 0
+	for _, r := range s {
+		if surrSelf <= r && r <= maxRune {
+			// Supplementary-plane rune: emit a surrogate pair.
+			hi, lo := EncodeRune(r)
+			out[pos], out[pos+1] = uint16(hi), uint16(lo)
+			pos += 2
+			continue
+		}
+		// Everything else takes one unit; invalid values fall back to U+FFFD.
+		unit := uint16(replacementChar)
+		if (0 <= r && r < surr1) || (surr3 <= r && r < surrSelf) {
+			unit = uint16(r)
+		}
+		out[pos] = unit
+		pos++
+	}
+	return out[:pos]
+}
+
+// Decode returns the Unicode code point sequence represented
+// by the UTF-16 encoding s. A surrogate that is not part of a valid
+// high/low pair decodes to U+FFFD.
+func Decode(s []uint16) []rune {
+	out := make([]rune, len(s))
+	count := 0
+	for i := 0; i < len(s); i++ {
+		unit := s[i]
+		// Start from the replacement character; it survives only for an
+		// invalid surrogate sequence.
+		decoded := rune(replacementChar)
+		if unit < surr1 || unit >= surr3 {
+			// Not a surrogate: the unit is the code point.
+			decoded = rune(unit)
+		} else if unit < surr2 && i+1 < len(s) && surr2 <= s[i+1] && s[i+1] < surr3 {
+			// High surrogate followed by a low surrogate: consume both.
+			decoded = DecodeRune(rune(unit), rune(s[i+1]))
+			i++
+		}
+		out[count] = decoded
+		count++
+	}
+	return out[:count]
+}
diff --git a/gnovm/stdlibs/unicode/utf16/utf16_test.gno b/gnovm/stdlibs/unicode/utf16/utf16_test.gno
@@ -0,0 +1,208 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf16_test
+
+import (
+	"reflect"
+	"testing"
+	"unicode"
+	. "unicode/utf16"
+)
+
+// TestConstants checks that the constants redefined locally in utf16
+// still agree with the ones exported by package unicode.
+func TestConstants(t *testing.T) {
+	if got, want := MaxRune, unicode.MaxRune; got != want {
+		t.Errorf("utf16.maxRune is wrong: %x should be %x", got, want)
+	}
+	if got, want := ReplacementChar, unicode.ReplacementChar; got != want {
+		t.Errorf("utf16.replacementChar is wrong: %x should be %x", got, want)
+	}
+}
+
+type encodeTest struct {
+	in  []rune   // code points handed to Encode
+	out []uint16 // expected UTF-16 code units
+}
+
+var encodeTests = []encodeTest{ // shared by TestEncode and TestEncodeRune
+	{[]rune{1, 2, 3, 4}, []uint16{1, 2, 3, 4}},
+	{
+		[]rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff},
+		[]uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff}, // each rune >= 0x10000 becomes a surrogate pair
+	},
+	{
+		[]rune{'a', 'b', 0xd7ff, 0xd800, 0xdfff, 0xe000, 0x110000, -1},
+		[]uint16{'a', 'b', 0xd7ff, 0xfffd, 0xfffd, 0xe000, 0xfffd, 0xfffd}, // surrogates and out-of-range values become U+FFFD
+	},
+}
+
+// TestEncode runs Encode over every entry of encodeTests and compares the
+// result with the expected code units.
+func TestEncode(t *testing.T) {
+	for i := range encodeTests {
+		tc := encodeTests[i]
+		got := Encode(tc.in)
+		if reflect.DeepEqual(got, tc.out) {
+			continue
+		}
+		t.Errorf("Encode(%x) = %x; want %x", tc.in, got, tc.out)
+	}
+}
+
+// TestEncodeRune feeds every input rune of encodeTests to EncodeRune while
+// walking the expected output in step. A rune below 0x10000 or above
+// unicode.MaxRune must yield U+FFFD twice and accounts for one expected
+// unit; any other rune must match the next two expected units and
+// round-trip through DecodeRune.
+func TestEncodeRune(t *testing.T) {
+	for idx, tc := range encodeTests {
+		pos := 0 // index of the next unconsumed unit in tc.out
+		for _, r := range tc.in {
+			hi, lo := EncodeRune(r)
+			if r < 0x10000 || r > unicode.MaxRune {
+				// No surrogate pair exists for r.
+				if pos >= len(tc.out) {
+					t.Errorf("#%d: ran out of tt.out", idx)
+					break
+				}
+				if hi != unicode.ReplacementChar || lo != unicode.ReplacementChar {
+					t.Errorf("EncodeRune(%#x) = %#x, %#x; want 0xfffd, 0xfffd", r, hi, lo)
+				}
+				pos++
+				continue
+			}
+
+			if pos+1 >= len(tc.out) {
+				t.Errorf("#%d: ran out of tt.out", idx)
+				break
+			}
+			wantHi, wantLo := tc.out[pos], tc.out[pos+1]
+			if hi != rune(wantHi) || lo != rune(wantLo) {
+				t.Errorf("EncodeRune(%#x) = %#x, %#x; want %#x, %#x", r, hi, lo, wantHi, wantLo)
+			}
+			pos += 2
+			if back := DecodeRune(hi, lo); back != r {
+				t.Errorf("DecodeRune(%#x, %#x) = %#x; want %#x", hi, lo, back, r)
+			}
+		}
+		if pos != len(tc.out) {
+			t.Errorf("#%d: EncodeRune didn't generate enough output", idx)
+		}
+	}
+}
+
+type decodeTest struct {
+	in  []uint16 // UTF-16 code units handed to Decode
+	out []rune   // expected code points
+}
+
+var decodeTests = []decodeTest{
+	{[]uint16{1, 2, 3, 4}, []rune{1, 2, 3, 4}},
+	{
+		[]uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff},
+		[]rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff}, // each surrogate pair collapses into one rune
+	},
+	{[]uint16{0xd800, 'a'}, []rune{0xfffd, 'a'}}, // high surrogate not followed by a low surrogate
+	{[]uint16{0xdfff}, []rune{0xfffd}},           // lone low surrogate
+}
+
+// TestDecode runs Decode over every entry of decodeTests and compares the
+// result with the expected code points.
+func TestDecode(t *testing.T) {
+	for i := range decodeTests {
+		tc := decodeTests[i]
+		got := Decode(tc.in)
+		if reflect.DeepEqual(got, tc.out) {
+			continue
+		}
+		t.Errorf("Decode(%x) = %x; want %x", tc.in, got, tc.out)
+	}
+}
+
+var decodeRuneTests = []struct {
+	r1, r2 rune // the two values passed to DecodeRune
+	want   rune // expected decoded code point
+}{
+	{0xd800, 0xdc00, 0x10000},
+	{0xd800, 0xdc01, 0x10001},
+	{0xd808, 0xdf45, 0x12345},
+	{0xdbff, 0xdfff, 0x10ffff},
+	{0xd800, 'a', 0xfffd}, // illegal: second value is not a low surrogate, so the replacement rune is substituted
+}
+
+// TestDecodeRune checks DecodeRune against every entry of decodeRuneTests.
+func TestDecodeRune(t *testing.T) {
+	for idx := range decodeRuneTests {
+		tc := decodeRuneTests[idx]
+		if got := DecodeRune(tc.r1, tc.r2); got != tc.want {
+			t.Errorf("%d: DecodeRune(%q, %q) = %v; want %v", idx, tc.r1, tc.r2, got, tc.want)
+		}
+	}
+}
+
+var surrogateTests = []struct {
+	r    rune // code point passed to IsSurrogate
+	want bool // expected result
+}{
+	// from https://en.wikipedia.org/wiki/UTF-16
+	{'\u007A', false},     // LATIN SMALL LETTER Z
+	{'\u6C34', false},     // CJK UNIFIED IDEOGRAPH-6C34 (water)
+	{'\uFEFF', false},     // Byte Order Mark
+	{'\U00010000', false}, // LINEAR B SYLLABLE B008 A (first non-BMP code point)
+	{'\U0001D11E', false}, // MUSICAL SYMBOL G CLEF
+	{'\U0010FFFD', false}, // PRIVATE USE CHARACTER-10FFFD (near the top of the range; the maximum code point is U+10FFFF)
+
+	{rune(0xd7ff), false}, // surr1-1
+	{rune(0xd800), true},  // surr1
+	{rune(0xdc00), true},  // surr2
+	{rune(0xe000), false}, // surr3
+	{rune(0xdfff), true},  // surr3-1
+}
+
+// TestIsSurrogate checks IsSurrogate against every entry of surrogateTests.
+func TestIsSurrogate(t *testing.T) {
+	for idx := range surrogateTests {
+		tc := surrogateTests[idx]
+		if got := IsSurrogate(tc.r); got != tc.want {
+			t.Errorf("%d: IsSurrogate(%q) = %v; want %v", idx, tc.r, got, tc.want)
+		}
+	}
+}
+
+// BenchmarkDecodeValidASCII measures Decode on the code units of
+// "hello world".
+func BenchmarkDecodeValidASCII(b *testing.B) {
+	units := []uint16{'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'}
+	for iter := 0; iter < b.N; iter++ {
+		Decode(units)
+	}
+}
+
+// BenchmarkDecodeValidJapaneseChars measures Decode on the code units of
+// "日本語日本語日本語", all of which are single-unit characters.
+func BenchmarkDecodeValidJapaneseChars(b *testing.B) {
+	units := []uint16{'日', '本', '語', '日', '本', '語', '日', '本', '語'}
+	for iter := 0; iter < b.N; iter++ {
+		Decode(units)
+	}
+}
+
+// BenchmarkDecodeRune measures DecodeRune on five surrogate pairs that are
+// prepared with EncodeRune before the timer is reset.
+func BenchmarkDecodeRune(b *testing.B) {
+	// U+1D4D0 to U+1D4D4: MATHEMATICAL BOLD SCRIPT CAPITAL LETTERS
+	pairs := make([]rune, 0, 10)
+	for _, u := range []rune{'𝓐', '𝓑', '𝓒', '𝓓', '𝓔'} {
+		hi, lo := EncodeRune(u)
+		pairs = append(pairs, hi, lo)
+	}
+
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		for k := 0; k+1 < len(pairs); k += 2 {
+			DecodeRune(pairs[k], pairs[k+1])
+		}
+	}
+}
+
+// BenchmarkEncodeValidASCII measures Encode on the runes of "hello".
+func BenchmarkEncodeValidASCII(b *testing.B) {
+	runes := []rune("hello")
+	for iter := 0; iter < b.N; iter++ {
+		Encode(runes)
+	}
+}
+
+// BenchmarkEncodeValidJapaneseChars measures Encode on the runes of "日本語".
+func BenchmarkEncodeValidJapaneseChars(b *testing.B) {
+	runes := []rune("日本語")
+	for iter := 0; iter < b.N; iter++ {
+		Encode(runes)
+	}
+}
+
+// BenchmarkEncodeRune measures EncodeRune on five runes that each need a
+// surrogate pair. The slice literal is built inside the timed loop.
+func BenchmarkEncodeRune(b *testing.B) {
+	for iter := 0; iter < b.N; iter++ {
+		runes := []rune{'𝓐', '𝓑', '𝓒', '𝓓', '𝓔'}
+		for k := range runes {
+			EncodeRune(runes[k])
+		}
+	}
+}