-
Notifications
You must be signed in to change notification settings - Fork 375
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(gnovm): add unicode/utf16 in stdlibs
Signed-off-by: Norman Meier <norman@berty.tech>
- Loading branch information
Showing
3 changed files
with
327 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
// Copyright 2012 The Go Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package utf16 | ||
|
||
// Extra names for constants so we can validate them during testing. | ||
const ( | ||
MaxRune = maxRune | ||
ReplacementChar = replacementChar | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
// Copyright 2010 The Go Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in the LICENSE file. | ||
|
||
// Package utf16 implements encoding and decoding of UTF-16 sequences. | ||
package utf16 | ||
|
||
// The conditions replacementChar==unicode.ReplacementChar and | ||
// maxRune==unicode.MaxRune are verified in the tests. | ||
// Defining them locally avoids this package depending on package unicode. | ||
|
||
const (
	// replacementChar is the Unicode replacement character, U+FFFD.
	replacementChar = '\uFFFD'
	// maxRune is the maximum valid Unicode code point, U+10FFFF.
	maxRune = '\U0010FFFF'
)
|
||
// Boundaries of the UTF-16 surrogate ranges. A pair carries 20 bits of
// payload: a unit in [surr1, surr2) holds the high 10 bits, a unit in
// [surr2, surr3) holds the low 10 bits, and the decoded value is those
// 20 bits plus surrSelf.
const (
	surr1 = 0xd800 // start of the high-10-bits range
	surr2 = 0xdc00 // end of the high range, start of the low-10-bits range
	surr3 = 0xe000 // end (exclusive) of the low-10-bits range

	surrSelf = 0x10000 // offset added to the 20 payload bits of a pair
)
|
||
// IsSurrogate reports whether the specified Unicode code point | ||
// can appear in a surrogate pair. | ||
func IsSurrogate(r rune) bool { | ||
return surr1 <= r && r < surr3 | ||
} | ||
|
||
// DecodeRune returns the UTF-16 decoding of a surrogate pair. | ||
// If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns | ||
// the Unicode replacement code point U+FFFD. | ||
func DecodeRune(r1, r2 rune) rune { | ||
if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 { | ||
return (r1-surr1)<<10 | (r2 - surr2) + surrSelf | ||
} | ||
return replacementChar | ||
} | ||
|
||
// EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune. | ||
// If the rune is not a valid Unicode code point or does not need encoding, | ||
// EncodeRune returns U+FFFD, U+FFFD. | ||
func EncodeRune(r rune) (r1, r2 rune) { | ||
if r < surrSelf || r > maxRune { | ||
return replacementChar, replacementChar | ||
} | ||
r -= surrSelf | ||
return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff | ||
} | ||
|
||
// Encode returns the UTF-16 encoding of the Unicode code point sequence s. | ||
func Encode(s []rune) []uint16 { | ||
n := len(s) | ||
for _, v := range s { | ||
if v >= surrSelf { | ||
n++ | ||
} | ||
} | ||
|
||
a := make([]uint16, n) | ||
n = 0 | ||
for _, v := range s { | ||
switch { | ||
case 0 <= v && v < surr1, surr3 <= v && v < surrSelf: | ||
// normal rune | ||
a[n] = uint16(v) | ||
n++ | ||
case surrSelf <= v && v <= maxRune: | ||
// needs surrogate sequence | ||
r1, r2 := EncodeRune(v) | ||
a[n] = uint16(r1) | ||
a[n+1] = uint16(r2) | ||
n += 2 | ||
default: | ||
a[n] = uint16(replacementChar) | ||
n++ | ||
} | ||
} | ||
return a[:n] | ||
} | ||
|
||
// Decode returns the Unicode code point sequence represented | ||
// by the UTF-16 encoding s. | ||
func Decode(s []uint16) []rune { | ||
a := make([]rune, len(s)) | ||
n := 0 | ||
for i := 0; i < len(s); i++ { | ||
switch r := s[i]; { | ||
case r < surr1, surr3 <= r: | ||
// normal rune | ||
a[n] = rune(r) | ||
case surr1 <= r && r < surr2 && i+1 < len(s) && | ||
surr2 <= s[i+1] && s[i+1] < surr3: | ||
// valid surrogate sequence | ||
a[n] = DecodeRune(rune(r), rune(s[i+1])) | ||
i++ | ||
default: | ||
// invalid surrogate sequence | ||
a[n] = replacementChar | ||
} | ||
n++ | ||
} | ||
return a[:n] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
// Copyright 2010 The Go Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package utf16_test | ||
|
||
import ( | ||
"reflect" | ||
"testing" | ||
"unicode" | ||
. "unicode/utf16" | ||
) | ||
|
||
// Validate the constants redefined from unicode. | ||
func TestConstants(t *testing.T) { | ||
if MaxRune != unicode.MaxRune { | ||
t.Errorf("utf16.maxRune is wrong: %x should be %x", MaxRune, unicode.MaxRune) | ||
} | ||
if ReplacementChar != unicode.ReplacementChar { | ||
t.Errorf("utf16.replacementChar is wrong: %x should be %x", ReplacementChar, unicode.ReplacementChar) | ||
} | ||
} | ||
|
||
// encodeTest is a single encoding case: a code point sequence and the
// UTF-16 code units expected for it.
type encodeTest struct {
	in  []rune
	out []uint16
}

// encodeTests holds the cases shared by TestEncode and TestEncodeRune.
var encodeTests = []encodeTest{
	{
		in:  []rune{1, 2, 3, 4},
		out: []uint16{1, 2, 3, 4},
	},
	{
		in:  []rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff},
		out: []uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff},
	},
	{
		in:  []rune{'a', 'b', 0xd7ff, 0xd800, 0xdfff, 0xe000, 0x110000, -1},
		out: []uint16{'a', 'b', 0xd7ff, 0xfffd, 0xfffd, 0xe000, 0xfffd, 0xfffd},
	},
}
|
||
func TestEncode(t *testing.T) { | ||
for _, tt := range encodeTests { | ||
out := Encode(tt.in) | ||
if !reflect.DeepEqual(out, tt.out) { | ||
t.Errorf("Encode(%x) = %x; want %x", tt.in, out, tt.out) | ||
} | ||
} | ||
} | ||
|
||
func TestEncodeRune(t *testing.T) { | ||
for i, tt := range encodeTests { | ||
j := 0 | ||
for _, r := range tt.in { | ||
r1, r2 := EncodeRune(r) | ||
if r < 0x10000 || r > unicode.MaxRune { | ||
if j >= len(tt.out) { | ||
t.Errorf("#%d: ran out of tt.out", i) | ||
break | ||
} | ||
if r1 != unicode.ReplacementChar || r2 != unicode.ReplacementChar { | ||
t.Errorf("EncodeRune(%#x) = %#x, %#x; want 0xfffd, 0xfffd", r, r1, r2) | ||
} | ||
j++ | ||
} else { | ||
if j+1 >= len(tt.out) { | ||
t.Errorf("#%d: ran out of tt.out", i) | ||
break | ||
} | ||
if r1 != rune(tt.out[j]) || r2 != rune(tt.out[j+1]) { | ||
t.Errorf("EncodeRune(%#x) = %#x, %#x; want %#x, %#x", r, r1, r2, tt.out[j], tt.out[j+1]) | ||
} | ||
j += 2 | ||
dec := DecodeRune(r1, r2) | ||
if dec != r { | ||
t.Errorf("DecodeRune(%#x, %#x) = %#x; want %#x", r1, r2, dec, r) | ||
} | ||
} | ||
} | ||
if j != len(tt.out) { | ||
t.Errorf("#%d: EncodeRune didn't generate enough output", i) | ||
} | ||
} | ||
} | ||
|
||
// decodeTest is a single decoding case: a UTF-16 code unit sequence and the
// code points expected for it.
type decodeTest struct {
	in  []uint16
	out []rune
}

// decodeTests holds the cases used by TestDecode, including a high
// surrogate followed by a non-surrogate and a lone low surrogate.
var decodeTests = []decodeTest{
	{
		in:  []uint16{1, 2, 3, 4},
		out: []rune{1, 2, 3, 4},
	},
	{
		in:  []uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff},
		out: []rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff},
	},
	{
		in:  []uint16{0xd800, 'a'},
		out: []rune{0xfffd, 'a'},
	},
	{
		in:  []uint16{0xdfff},
		out: []rune{0xfffd},
	},
}
|
||
func TestDecode(t *testing.T) { | ||
for _, tt := range decodeTests { | ||
out := Decode(tt.in) | ||
if !reflect.DeepEqual(out, tt.out) { | ||
t.Errorf("Decode(%x) = %x; want %x", tt.in, out, tt.out) | ||
} | ||
} | ||
} | ||
|
||
// decodeRuneTests lists surrogate pairs together with the rune DecodeRune
// is expected to return for them.
var decodeRuneTests = []struct {
	r1, r2 rune
	want   rune
}{
	{r1: 0xd800, r2: 0xdc00, want: 0x10000},
	{r1: 0xd800, r2: 0xdc01, want: 0x10001},
	{r1: 0xd808, r2: 0xdf45, want: 0x12345},
	{r1: 0xdbff, r2: 0xdfff, want: 0x10ffff},
	// Illegal pair: the second half is not a low surrogate, so the
	// replacement rune is expected.
	{r1: 0xd800, r2: 'a', want: 0xfffd},
}
|
||
func TestDecodeRune(t *testing.T) { | ||
for i, tt := range decodeRuneTests { | ||
got := DecodeRune(tt.r1, tt.r2) | ||
if got != tt.want { | ||
t.Errorf("%d: DecodeRune(%q, %q) = %v; want %v", i, tt.r1, tt.r2, got, tt.want) | ||
} | ||
} | ||
} | ||
|
||
// surrogateTests lists runes together with the answer IsSurrogate is
// expected to give for them.
var surrogateTests = []struct {
	r    rune
	want bool
}{
	// Sample code points from https://en.wikipedia.org/wiki/UTF-16;
	// none of them is a surrogate.
	{r: '\u007A', want: false},     // LATIN SMALL LETTER Z
	{r: '\u6C34', want: false},     // CJK UNIFIED IDEOGRAPH-6C34 (water)
	{r: '\uFEFF', want: false},     // Byte Order Mark
	{r: '\U00010000', want: false}, // LINEAR B SYLLABLE B008 A (first non-BMP code point)
	{r: '\U0001D11E', want: false}, // MUSICAL SYMBOL G CLEF
	{r: '\U0010FFFD', want: false}, // PRIVATE USE CHARACTER-10FFFD (last private-use code point)

	// Values on and around the edges of the surrogate range.
	{r: 0xd7ff, want: false}, // surr1-1
	{r: 0xd800, want: true},  // surr1
	{r: 0xdc00, want: true},  // surr2
	{r: 0xe000, want: false}, // surr3
	{r: 0xdfff, want: true},  // surr3-1
}
|
||
func TestIsSurrogate(t *testing.T) { | ||
for i, tt := range surrogateTests { | ||
got := IsSurrogate(tt.r) | ||
if got != tt.want { | ||
t.Errorf("%d: IsSurrogate(%q) = %v; want %v", i, tt.r, got, tt.want) | ||
} | ||
} | ||
} | ||
|
||
func BenchmarkDecodeValidASCII(b *testing.B) { | ||
// "hello world" | ||
data := []uint16{104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100} | ||
for i := 0; i < b.N; i++ { | ||
Decode(data) | ||
} | ||
} | ||
|
||
func BenchmarkDecodeValidJapaneseChars(b *testing.B) { | ||
// "日本語日本語日本語" | ||
data := []uint16{26085, 26412, 35486, 26085, 26412, 35486, 26085, 26412, 35486} | ||
for i := 0; i < b.N; i++ { | ||
Decode(data) | ||
} | ||
} | ||
|
||
func BenchmarkDecodeRune(b *testing.B) { | ||
rs := make([]rune, 10) | ||
// U+1D4D0 to U+1D4D4: MATHEMATICAL BOLD SCRIPT CAPITAL LETTERS | ||
for i, u := range []rune{'𝓐', '𝓑', '𝓒', '𝓓', '𝓔'} { | ||
rs[2*i], rs[2*i+1] = EncodeRune(u) | ||
} | ||
|
||
b.ResetTimer() | ||
for i := 0; i < b.N; i++ { | ||
for j := 0; j < 5; j++ { | ||
DecodeRune(rs[2*j], rs[2*j+1]) | ||
} | ||
} | ||
} | ||
|
||
func BenchmarkEncodeValidASCII(b *testing.B) { | ||
data := []rune{'h', 'e', 'l', 'l', 'o'} | ||
for i := 0; i < b.N; i++ { | ||
Encode(data) | ||
} | ||
} | ||
|
||
func BenchmarkEncodeValidJapaneseChars(b *testing.B) { | ||
data := []rune{'日', '本', '語'} | ||
for i := 0; i < b.N; i++ { | ||
Encode(data) | ||
} | ||
} | ||
|
||
func BenchmarkEncodeRune(b *testing.B) { | ||
for i := 0; i < b.N; i++ { | ||
for _, u := range []rune{'𝓐', '𝓑', '𝓒', '𝓓', '𝓔'} { | ||
EncodeRune(u) | ||
} | ||
} | ||
} |