From 400aefbe0a14525b6ef9560ca58ce961955d4572 Mon Sep 17 00:00:00 2001 From: Norman Meier Date: Thu, 24 Aug 2023 19:45:41 +0200 Subject: [PATCH 1/4] feat(gnovm): add unicode/utf16 in stdlibs Signed-off-by: Norman Meier --- gnovm/docs/go-gno-compatibility.md | 2 +- gnovm/stdlibs/unicode/utf16/export_test.gno | 11 ++ gnovm/stdlibs/unicode/utf16/utf16.gno | 108 ++++++++++ gnovm/stdlibs/unicode/utf16/utf16_test.gno | 208 ++++++++++++++++++++ 4 files changed, 328 insertions(+), 1 deletion(-) create mode 100644 gnovm/stdlibs/unicode/utf16/export_test.gno create mode 100644 gnovm/stdlibs/unicode/utf16/utf16.gno create mode 100644 gnovm/stdlibs/unicode/utf16/utf16_test.gno diff --git a/gnovm/docs/go-gno-compatibility.md b/gnovm/docs/go-gno-compatibility.md index e3616861ca1..ff16df371a6 100644 --- a/gnovm/docs/go-gno-compatibility.md +++ b/gnovm/docs/go-gno-compatibility.md @@ -337,7 +337,7 @@ Additional native types: | time | TBD | | time/tzdata | TBD | | unicode | TBD | -| unicode/utf16 | TBD | +| unicode/utf16 | full | | unicode/utf8 | TBD | | unsafe | TBD | diff --git a/gnovm/stdlibs/unicode/utf16/export_test.gno b/gnovm/stdlibs/unicode/utf16/export_test.gno new file mode 100644 index 00000000000..e0c57f52aef --- /dev/null +++ b/gnovm/stdlibs/unicode/utf16/export_test.gno @@ -0,0 +1,11 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package utf16 + +// Extra names for constants so we can validate them during testing. +const ( + MaxRune = maxRune + ReplacementChar = replacementChar +) diff --git a/gnovm/stdlibs/unicode/utf16/utf16.gno b/gnovm/stdlibs/unicode/utf16/utf16.gno new file mode 100644 index 00000000000..1a881aa7695 --- /dev/null +++ b/gnovm/stdlibs/unicode/utf16/utf16.gno @@ -0,0 +1,108 @@ +// Copyright 2010 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package utf16 implements encoding and decoding of UTF-16 sequences. +package utf16 + +// The conditions replacementChar==unicode.ReplacementChar and +// maxRune==unicode.MaxRune are verified in the tests. +// Defining them locally avoids this package depending on package unicode. + +const ( + replacementChar = '\uFFFD' // Unicode replacement character + maxRune = '\U0010FFFF' // Maximum valid Unicode code point. +) + +const ( + // 0xd800-0xdc00 encodes the high 10 bits of a pair. + // 0xdc00-0xe000 encodes the low 10 bits of a pair. + // the value is those 20 bits plus 0x10000. + surr1 = 0xd800 + surr2 = 0xdc00 + surr3 = 0xe000 + + surrSelf = 0x10000 +) + +// IsSurrogate reports whether the specified Unicode code point +// can appear in a surrogate pair. +func IsSurrogate(r rune) bool { + return surr1 <= r && r < surr3 +} + +// DecodeRune returns the UTF-16 decoding of a surrogate pair. +// If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns +// the Unicode replacement code point U+FFFD. +func DecodeRune(r1, r2 rune) rune { + if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 { + return (r1-surr1)<<10 | (r2 - surr2) + surrSelf + } + return replacementChar +} + +// EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune. +// If the rune is not a valid Unicode code point or does not need encoding, +// EncodeRune returns U+FFFD, U+FFFD. +func EncodeRune(r rune) (r1, r2 rune) { + if r < surrSelf || r > maxRune { + return replacementChar, replacementChar + } + r -= surrSelf + return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff +} + +// Encode returns the UTF-16 encoding of the Unicode code point sequence s. 
+func Encode(s []rune) []uint16 { + n := len(s) + for _, v := range s { + if v >= surrSelf { + n++ + } + } + + a := make([]uint16, n) + n = 0 + for _, v := range s { + switch { + case 0 <= v && v < surr1, surr3 <= v && v < surrSelf: + // normal rune + a[n] = uint16(v) + n++ + case surrSelf <= v && v <= maxRune: + // needs surrogate sequence + r1, r2 := EncodeRune(v) + a[n] = uint16(r1) + a[n+1] = uint16(r2) + n += 2 + default: + a[n] = uint16(replacementChar) + n++ + } + } + return a[:n] +} + +// Decode returns the Unicode code point sequence represented +// by the UTF-16 encoding s. +func Decode(s []uint16) []rune { + a := make([]rune, len(s)) + n := 0 + for i := 0; i < len(s); i++ { + switch r := s[i]; { + case r < surr1, surr3 <= r: + // normal rune + a[n] = rune(r) + case surr1 <= r && r < surr2 && i+1 < len(s) && + surr2 <= s[i+1] && s[i+1] < surr3: + // valid surrogate sequence + a[n] = DecodeRune(rune(r), rune(s[i+1])) + i++ + default: + // invalid surrogate sequence + a[n] = replacementChar + } + n++ + } + return a[:n] +} diff --git a/gnovm/stdlibs/unicode/utf16/utf16_test.gno b/gnovm/stdlibs/unicode/utf16/utf16_test.gno new file mode 100644 index 00000000000..68e5deefb53 --- /dev/null +++ b/gnovm/stdlibs/unicode/utf16/utf16_test.gno @@ -0,0 +1,208 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package utf16_test + +import ( + "reflect" + "testing" + "unicode" + . "unicode/utf16" +) + +// Validate the constants redefined from unicode. 
+func TestConstants(t *testing.T) { + if MaxRune != unicode.MaxRune { + t.Errorf("utf16.maxRune is wrong: %x should be %x", MaxRune, unicode.MaxRune) + } + if ReplacementChar != unicode.ReplacementChar { + t.Errorf("utf16.replacementChar is wrong: %x should be %x", ReplacementChar, unicode.ReplacementChar) + } +} + +type encodeTest struct { + in []rune + out []uint16 +} + +var encodeTests = []encodeTest{ + {[]rune{1, 2, 3, 4}, []uint16{1, 2, 3, 4}}, + { + []rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff}, + []uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff}, + }, + { + []rune{'a', 'b', 0xd7ff, 0xd800, 0xdfff, 0xe000, 0x110000, -1}, + []uint16{'a', 'b', 0xd7ff, 0xfffd, 0xfffd, 0xe000, 0xfffd, 0xfffd}, + }, +} + +func TestEncode(t *testing.T) { + for _, tt := range encodeTests { + out := Encode(tt.in) + if !reflect.DeepEqual(out, tt.out) { + t.Errorf("Encode(%x) = %x; want %x", tt.in, out, tt.out) + } + } +} + +func TestEncodeRune(t *testing.T) { + for i, tt := range encodeTests { + j := 0 + for _, r := range tt.in { + r1, r2 := EncodeRune(r) + if r < 0x10000 || r > unicode.MaxRune { + if j >= len(tt.out) { + t.Errorf("#%d: ran out of tt.out", i) + break + } + if r1 != unicode.ReplacementChar || r2 != unicode.ReplacementChar { + t.Errorf("EncodeRune(%#x) = %#x, %#x; want 0xfffd, 0xfffd", r, r1, r2) + } + j++ + } else { + if j+1 >= len(tt.out) { + t.Errorf("#%d: ran out of tt.out", i) + break + } + if r1 != rune(tt.out[j]) || r2 != rune(tt.out[j+1]) { + t.Errorf("EncodeRune(%#x) = %#x, %#x; want %#x, %#x", r, r1, r2, tt.out[j], tt.out[j+1]) + } + j += 2 + dec := DecodeRune(r1, r2) + if dec != r { + t.Errorf("DecodeRune(%#x, %#x) = %#x; want %#x", r1, r2, dec, r) + } + } + } + if j != len(tt.out) { + t.Errorf("#%d: EncodeRune didn't generate enough output", i) + } + } +} + +type decodeTest struct { + in []uint16 + out []rune +} + +var decodeTests = []decodeTest{ + {[]uint16{1, 2, 3, 4}, []rune{1, 2, 3, 4}}, + { + []uint16{0xffff, 0xd800, 
0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff}, + []rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff}, + }, + {[]uint16{0xd800, 'a'}, []rune{0xfffd, 'a'}}, + {[]uint16{0xdfff}, []rune{0xfffd}}, +} + +func TestDecode(t *testing.T) { + for _, tt := range decodeTests { + out := Decode(tt.in) + if !reflect.DeepEqual(out, tt.out) { + t.Errorf("Decode(%x) = %x; want %x", tt.in, out, tt.out) + } + } +} + +var decodeRuneTests = []struct { + r1, r2 rune + want rune +}{ + {0xd800, 0xdc00, 0x10000}, + {0xd800, 0xdc01, 0x10001}, + {0xd808, 0xdf45, 0x12345}, + {0xdbff, 0xdfff, 0x10ffff}, + {0xd800, 'a', 0xfffd}, // illegal, replacement rune substituted +} + +func TestDecodeRune(t *testing.T) { + for i, tt := range decodeRuneTests { + got := DecodeRune(tt.r1, tt.r2) + if got != tt.want { + t.Errorf("%d: DecodeRune(%q, %q) = %v; want %v", i, tt.r1, tt.r2, got, tt.want) + } + } +} + +var surrogateTests = []struct { + r rune + want bool +}{ + // from https://en.wikipedia.org/wiki/UTF-16 + {'\u007A', false}, // LATIN SMALL LETTER Z + {'\u6C34', false}, // CJK UNIFIED IDEOGRAPH-6C34 (water) + {'\uFEFF', false}, // Byte Order Mark + {'\U00010000', false}, // LINEAR B SYLLABLE B008 A (first non-BMP code point) + {'\U0001D11E', false}, // MUSICAL SYMBOL G CLEF + {'\U0010FFFD', false}, // PRIVATE USE CHARACTER-10FFFD (last Unicode code point) + + {rune(0xd7ff), false}, // surr1-1 + {rune(0xd800), true}, // surr1 + {rune(0xdc00), true}, // surr2 + {rune(0xe000), false}, // surr3 + {rune(0xdfff), true}, // surr3-1 +} + +func TestIsSurrogate(t *testing.T) { + for i, tt := range surrogateTests { + got := IsSurrogate(tt.r) + if got != tt.want { + t.Errorf("%d: IsSurrogate(%q) = %v; want %v", i, tt.r, got, tt.want) + } + } +} + +func BenchmarkDecodeValidASCII(b *testing.B) { + // "hello world" + data := []uint16{104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100} + for i := 0; i < b.N; i++ { + Decode(data) + } +} + +func BenchmarkDecodeValidJapaneseChars(b *testing.B) { + // 
"日本語日本語日本語" + data := []uint16{26085, 26412, 35486, 26085, 26412, 35486, 26085, 26412, 35486} + for i := 0; i < b.N; i++ { + Decode(data) + } +} + +func BenchmarkDecodeRune(b *testing.B) { + rs := make([]rune, 10) + // U+1D4D0 to U+1D4D4: MATHEMATICAL BOLD SCRIPT CAPITAL LETTERS + for i, u := range []rune{'𝓐', '𝓑', '𝓒', '𝓓', '𝓔'} { + rs[2*i], rs[2*i+1] = EncodeRune(u) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + for j := 0; j < 5; j++ { + DecodeRune(rs[2*j], rs[2*j+1]) + } + } +} + +func BenchmarkEncodeValidASCII(b *testing.B) { + data := []rune{'h', 'e', 'l', 'l', 'o'} + for i := 0; i < b.N; i++ { + Encode(data) + } +} + +func BenchmarkEncodeValidJapaneseChars(b *testing.B) { + data := []rune{'日', '本', '語'} + for i := 0; i < b.N; i++ { + Encode(data) + } +} + +func BenchmarkEncodeRune(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, u := range []rune{'𝓐', '𝓑', '𝓒', '𝓓', '𝓔'} { + EncodeRune(u) + } + } +} From d09c845e6ce5d1bec6a3290418b589d882cbfcd5 Mon Sep 17 00:00:00 2001 From: Norman Meier Date: Thu, 24 Aug 2023 19:46:03 +0200 Subject: [PATCH 2/4] feat: ujson Signed-off-by: Norman Meier --- examples/gno.land/p/demo/ujson/format.gno | 108 ++++ examples/gno.land/p/demo/ujson/gno.mod | 5 + examples/gno.land/p/demo/ujson/parse.gno | 569 ++++++++++++++++++ examples/gno.land/p/demo/ujson/strings.gno | 229 +++++++ examples/gno.land/p/demo/ujson/tables.gno | 216 +++++++ examples/gno.land/p/demo/ujson/ujson_test.gno | 161 +++++ 6 files changed, 1288 insertions(+) create mode 100644 examples/gno.land/p/demo/ujson/format.gno create mode 100644 examples/gno.land/p/demo/ujson/gno.mod create mode 100644 examples/gno.land/p/demo/ujson/parse.gno create mode 100644 examples/gno.land/p/demo/ujson/strings.gno create mode 100644 examples/gno.land/p/demo/ujson/tables.gno create mode 100644 examples/gno.land/p/demo/ujson/ujson_test.gno diff --git a/examples/gno.land/p/demo/ujson/format.gno b/examples/gno.land/p/demo/ujson/format.gno new file mode 100644 index 
00000000000..cbd6600bdd4 --- /dev/null +++ b/examples/gno.land/p/demo/ujson/format.gno @@ -0,0 +1,108 @@ +package ujson + +// This package strives to have the same behavior as json.Marshal but does not support all types and returns strings + +import ( + "std" + "strconv" + "strings" + + "gno.land/p/demo/avl" +) + +type JSONAble interface { + ToJSON() string +} + +type FormatKV struct { + Key string + Value interface{} + Raw bool +} + +// does not work for slices, use FormatSlice instead +func FormatAny(p interface{}) string { + switch p.(type) { + case std.Address: + return FormatString(string(p.(std.Address))) + case *avl.Tree: + return FormatAVLTree(p.(*avl.Tree)) + case avl.Tree: + return FormatAVLTree(&p.(avl.Tree)) + case JSONAble: + return p.(JSONAble).ToJSON() + case string: + return FormatString(p.(string)) + case uint64: + return FormatUint64(p.(uint64)) + case uint32: + return FormatUint64(uint64(p.(uint32))) + case uint: + return FormatUint64(uint64(p.(uint))) + case int64: + return FormatInt64(p.(int64)) + case int32: + return FormatInt64(int64(p.(int32))) + case int: + return FormatInt64(int64(p.(int))) + case float32: + panic("float32 not implemented") + case float64: + panic("float64 not implemented") + case bool: + return FormatBool(p.(bool)) + default: + return "null" + } +} + +func FormatUint64(i uint64) string { + return strconv.FormatUint(i, 10) +} + +func FormatInt64(i int64) string { + return strconv.FormatInt(i, 10) +} + +func FormatSlice(s []interface{}) string { + elems := make([]string, len(s)) + for i, elem := range s { + elems[i] = FormatAny(elem) + } + return "[" + strings.Join(elems, ",") + "]" +} + +func FormatObject(kv []FormatKV) string { + elems := make([]string, len(kv)) + i := 0 + for _, elem := range kv { + var val string + if elem.Raw { + val = elem.Value.(string) + } else { + val = FormatAny(elem.Value) + } + elems[i] = FormatString(elem.Key) + ":" + val + i++ + } + return "{" + strings.Join(elems, ",") + "}" +} + +func 
FormatBool(b bool) string { + if b { + return "true" + } + return "false" +} + +func FormatAVLTree(t *avl.Tree) string { + if t == nil { + return "{}" + } + kv := make([]FormatKV, 0, t.Size()) + t.Iterate("", "", func(key string, value interface{}) bool { + kv = append(kv, FormatKV{key, value, false}) + return false + }) + return FormatObject(kv) +} diff --git a/examples/gno.land/p/demo/ujson/gno.mod b/examples/gno.land/p/demo/ujson/gno.mod new file mode 100644 index 00000000000..d14bc186682 --- /dev/null +++ b/examples/gno.land/p/demo/ujson/gno.mod @@ -0,0 +1,5 @@ +module gno.land/p/demo/ujson + +require ( + "gno.land/p/demo/avl" v0.0.0-latest +) \ No newline at end of file diff --git a/examples/gno.land/p/demo/ujson/parse.gno b/examples/gno.land/p/demo/ujson/parse.gno new file mode 100644 index 00000000000..c4e97bc21bb --- /dev/null +++ b/examples/gno.land/p/demo/ujson/parse.gno @@ -0,0 +1,569 @@ +package ujson + +import ( + "std" + "strconv" + "strings" + + "gno.land/p/demo/avl" +) + +// https://stackoverflow.com/a/4150626 +const whitespaces = " \t\n\r" + +type FromJSONAble interface { + FromJSON(ast *JSONASTNode) +} + +// does not work for slices, use ast exploration instead +func (ast *JSONASTNode) ParseAny(ptr *interface{}) { + switch ptr.(type) { + case *std.Address: + *ptr.(*std.Address) = std.Address(ParseString(ast.Value)) + case **avl.Tree: + panic("*avl.Tree not implemented, there is no way to know the type of the tree values, use ParseAVLTree instead") + case *avl.Tree: + panic("avl.Tree not implemented, there is no way to know the type of the tree values, use ParseAVLTree instead") + case *string: + if ast.Kind != JSONKindValue { + panic("not a value") + } + if ast.ValueKind != JSONTokenKindString { + panic("not a string") + } + *ptr.(*string) = ParseString(ast.Value) + case *uint64: + if ast.Kind != JSONKindValue { + panic("not a value") + } + if ast.ValueKind != JSONTokenKindNumber { + panic("not a number") + } + *ptr.(*uint64) = 
ParseUint64(ast.Value)
+	case *uint32:
+		if ast.Kind != JSONKindValue {
+			panic("not a value")
+		}
+		if ast.ValueKind != JSONTokenKindNumber {
+			panic("not a number")
+		}
+		*ptr.(*uint32) = uint32(ParseUint64(ast.Value))
+	case *uint:
+		if ast.Kind != JSONKindValue {
+			panic("not a value")
+		}
+		if ast.ValueKind != JSONTokenKindNumber {
+			panic("not a number")
+		}
+		*ptr.(*uint) = uint(ParseUint64(ast.Value))
+	case *int64:
+		if ast.Kind != JSONKindValue {
+			panic("not a value")
+		}
+		if ast.ValueKind != JSONTokenKindNumber {
+			panic("not a number")
+		}
+		*ptr.(*int64) = ParseInt64(ast.Value)
+	case *int32:
+		if ast.Kind != JSONKindValue {
+			panic("not a value")
+		}
+		if ast.ValueKind != JSONTokenKindNumber {
+			panic("not a number")
+		}
+		*ptr.(*int32) = int32(ParseInt64(ast.Value))
+	case *int:
+		if ast.Kind != JSONKindValue {
+			panic("not a value")
+		}
+		if ast.ValueKind != JSONTokenKindNumber {
+			panic("not a number")
+		}
+		*ptr.(*int) = int(ParseInt64(ast.Value))
+	case *float64:
+		panic("float64 not implemented")
+	case *float32:
+		panic("float32 not implemented")
+	case *bool:
+		if ast.Kind != JSONKindValue {
+			panic("not a value")
+		}
+		if ast.ValueKind != JSONTokenKindTrue && ast.ValueKind != JSONTokenKindFalse {
+			panic("not a bool")
+		}
+		*ptr.(*bool) = ast.ValueKind == JSONTokenKindTrue
+	case *FromJSONAble:
+		(*(ptr.(*FromJSONAble))).FromJSON(ast)
+	case FromJSONAble:
+		ptr.(FromJSONAble).FromJSON(ast)
+	case **JSONASTNode:
+		*ptr.(**JSONASTNode) = ast
+	default:
+		if ast.Kind == JSONKindValue && ast.ValueKind == JSONTokenKindNull {
+			*ptr = nil
+			return
+		}
+		panic("type not defined for `" + ast.String() + "`")
+	}
+}
+
+func ParseUint64(s string) uint64 { // parses a base-10 unsigned integer; panics with the strconv error on bad input
+	val, err := strconv.ParseUint(s, 10, 64) // not Atoi: covers the full uint64 range and rejects a leading '-'
+	if err != nil {
+		panic(err)
+	}
+	return val
+}
+
+func ParseInt64(s string) int64 { // parses a base-10 signed integer; panics with the strconv error on bad input
+	val, err := strconv.ParseInt(s, 10, 64) // explicit 64-bit parse, independent of the width of int
+	if err != nil {
+		panic(err)
+	}
+	return val
+}
+
+type ParseKV struct {
+	Key          string
+	Value        *interface{}
+	ArrayParser  func(children
[]*JSONASTNode) + ObjectParser func(children []*JSONASTKV) + CustomParser func(node *JSONASTNode) +} + +func ParseAny(s string, val *interface{}) { + tokens := tokenize(s) + if len(tokens) == 0 { + panic("empty json") + } + remainingTokens, ast := parseAST(tokens) + if len(remainingTokens) > 0 { + panic("invalid json") + } + ast.ParseAny(val) +} + +func (ast *JSONASTNode) ParseObject(kv []*ParseKV) { + if ast.Kind != JSONKindObject { + panic("not an object") + } + for _, elem := range kv { + for i, child := range ast.ObjectChildren { + if child.Key == elem.Key { + if elem.ArrayParser != nil { + if child.Value.Kind != JSONKindArray { + panic("not an array") + } + elem.ArrayParser(child.Value.ArrayChildren) + } else if elem.ObjectParser != nil { + if child.Value.Kind != JSONKindObject { + panic("not an object") + } + elem.ObjectParser(child.Value.ObjectChildren) + } else if elem.CustomParser != nil { + elem.CustomParser(child.Value) + } else { + child.Value.ParseAny(elem.Value) + } + break + } + if i == (len(ast.ObjectChildren) - 1) { + panic("invalid key `" + elem.Key + "` in object `" + ast.String() + "`") + } + } + } +} + +func ParseSlice(s string) []*JSONASTNode { + tokens := tokenize(s) + if len(tokens) == 0 { + panic("empty json") + } + remainingTokens, ast := parseAST(tokens) + if len(remainingTokens) > 0 { + panic("invalid json") + } + return ast.ParseSlice() +} + +func (ast *JSONASTNode) ParseSlice() []*JSONASTNode { + if ast.Kind != JSONKindArray { + panic("not an array") + } + return ast.ArrayChildren +} + +func countWhitespaces(s string) int { + i := 0 + for i < len(s) { + if strings.ContainsRune(whitespaces, int32(s[i])) { + i++ + } else { + break + } + } + return i +} + +func JSONTokensString(tokens []*JSONToken) string { + s := "" + for _, token := range tokens { + s += token.Raw + } + return s +} + +func (node *JSONASTNode) String() string { + if node == nil { + return "nil" + } + switch node.Kind { + case JSONKindValue: + return node.Value + case 
JSONKindArray: + s := "[" + for i, child := range node.ArrayChildren { + if i > 0 { + s += "," + } + s += child.String() + } + s += "]" + return s + case JSONKindObject: + s := "{" + for i, child := range node.ObjectChildren { + if i > 0 { + s += "," + } + s += `"` + child.Key + `":` + child.Value.String() + } + s += "}" + return s + default: + panic("invalid json") + } +} + +func TokenizeAndParse(s string) *JSONASTNode { + tokens := tokenize(s) + if len(tokens) == 0 { + panic("empty json") + } + remainingTokens, ast := parseAST(tokens) + if len(remainingTokens) > 0 { + panic("invalid json") + } + return ast +} + +func parseAST(tokens []*JSONToken) (tkn []*JSONToken, tree *JSONASTNode) { + if len(tokens) == 0 { + panic("empty json") + } + + switch tokens[0].Kind { + + case JSONTokenKindString: + return tokens[1:], &JSONASTNode{Kind: JSONKindValue, ValueKind: tokens[0].Kind, Value: tokens[0].Raw} + case JSONTokenKindNumber: + return tokens[1:], &JSONASTNode{Kind: JSONKindValue, ValueKind: tokens[0].Kind, Value: tokens[0].Raw} + case JSONTokenKindTrue: + return tokens[1:], &JSONASTNode{Kind: JSONKindValue, ValueKind: tokens[0].Kind, Value: tokens[0].Raw} + case JSONTokenKindFalse: + return tokens[1:], &JSONASTNode{Kind: JSONKindValue, ValueKind: tokens[0].Kind, Value: tokens[0].Raw} + case JSONTokenKindNull: + return tokens[1:], &JSONASTNode{Kind: JSONKindValue, ValueKind: tokens[0].Kind, Value: tokens[0].Raw} + + case JSONTokenKindOpenArray: + arrayChildren := []*JSONASTNode{} + tokens = tokens[1:] + for len(tokens) > 0 { + if tokens[0].Kind == JSONTokenKindCloseArray { + return tokens[1:], &JSONASTNode{Kind: JSONKindArray, ArrayChildren: arrayChildren} + } + var child *JSONASTNode + tokens, child = parseAST(tokens) + arrayChildren = append(arrayChildren, child) + if len(tokens) == 0 { + panic("exepected more tokens in array") + } + if tokens[0].Kind == JSONTokenKindComma { + tokens = tokens[1:] + } else if tokens[0].Kind == JSONTokenKindCloseArray { + return 
tokens[1:], &JSONASTNode{Kind: JSONKindArray, ArrayChildren: arrayChildren}
+			} else {
+				panic("unexpected token in array after value `" + tokens[0].Raw + "`")
+			}
+		}
+		panic("unterminated array") // loop ran out of tokens (input ended right after '[' or ','): fail instead of falling off the end with a nil AST
+	case JSONTokenKindOpenObject:
+		objectChildren := []*JSONASTKV{}
+		if len(tokens) < 2 {
+			panic("objects must have at least 2 tokens")
+		}
+		tokens = tokens[1:]
+		for len(tokens) > 0 {
+			if tokens[0].Kind == JSONTokenKindCloseObject {
+				return tokens[1:], &JSONASTNode{Kind: JSONKindObject, ObjectChildren: objectChildren}
+			}
+			if tokens[0].Kind != JSONTokenKindString {
+				panic("invalid json")
+			}
+			key := tokens[0].Raw
+			tokens = tokens[1:]
+			if len(tokens) == 0 {
+				panic("exepected more tokens in object")
+			}
+			if tokens[0].Kind != JSONTokenKindColon {
+				panic("expected :")
+			}
+			tokens = tokens[1:]
+			if len(tokens) == 0 {
+				panic("exepected more tokens in object after :")
+			}
+			var value *JSONASTNode
+			tokens, value = parseAST(tokens)
+			objectChildren = append(objectChildren, &JSONASTKV{Key: ParseString(key), Value: value})
+			if len(tokens) == 0 {
+				panic("exepected more tokens in object after value")
+			}
+			if tokens[0].Kind == JSONTokenKindComma {
+				tokens = tokens[1:]
+			} else if tokens[0].Kind == JSONTokenKindCloseObject {
+				return tokens[1:], &JSONASTNode{Kind: JSONKindObject, ObjectChildren: objectChildren}
+			} else {
+				panic("unexpected token in object after value `" + tokens[0].Raw + "`")
+			}
+		}
+		panic("unterminated object") // loop ran out of tokens (input ended right after ','): fail instead of falling off the end with a nil AST
+	default:
+		panic("unexpected token `" + tokens[0].Raw + "`")
+	}
+}
+
+func tokenize(s string) []*JSONToken {
+	tokens := []*JSONToken{}
+	for len(s) > 0 {
+		var token *JSONToken
+		s, token = tokenizeOne(s)
+		if token.Kind != JSONTokenKindSpaces {
+			tokens = append(tokens, token)
+		}
+	}
+	return tokens
+}
+
+func (node *JSONASTNode) ParseAVLTree(t *interface{}) *avl.Tree {
+	if node.Kind != JSONKindObject {
+		panic("not an object")
+	}
+	tree := avl.NewTree()
+	for _, child := range node.ObjectChildren {
+		child.Value.ParseAny(t)
+		tree.Set(child.Key, *t)
+	}
+	return tree
+}
+
+func
ParseAVLTree(s string, t *interface{}) *avl.Tree { + return TokenizeAndParse(s).ParseAVLTree(t) +} + +func tokenizeOne(s string) (string, *JSONToken) { + if len(s) == 0 { + panic("invalid token") + } + spacesCount := countWhitespaces(s) + if spacesCount > 0 { + spaces := s[:spacesCount] + return s[spacesCount:], &JSONToken{Kind: JSONTokenKindSpaces, Raw: spaces} + } + switch s[0] { + case '"': + return parseStringToken(s) + case 't': + return parseKeyword(s, "true", JSONTokenKindTrue) + case 'f': + return parseKeyword(s, "false", JSONTokenKindFalse) + case 'n': + return parseKeyword(s, "null", JSONTokenKindNull) + case '{': + return s[1:], &JSONToken{Kind: JSONTokenKindOpenObject, Raw: "{"} + case '[': + return s[1:], &JSONToken{Kind: JSONTokenKindOpenArray, Raw: "["} + case ':': + return s[1:], &JSONToken{Kind: JSONTokenKindColon, Raw: ":"} + case ',': + return s[1:], &JSONToken{Kind: JSONTokenKindComma, Raw: ","} + case ']': + return s[1:], &JSONToken{Kind: JSONTokenKindCloseArray, Raw: "]"} + case '}': + return s[1:], &JSONToken{Kind: JSONTokenKindCloseObject, Raw: "}"} + default: + return parseNumber(s) + } +} + +func parseKeyword(s string, keyword string, kind JSONTokenKind) (string, *JSONToken) { + if len(s) < len(keyword) { + panic("invalid keyword") + } + if s[:len(keyword)] != keyword { + panic("invalid keyword") + } + return s[len(keyword):], &JSONToken{Kind: kind, Raw: keyword} +} + +func parseStringToken(s string) (string, *JSONToken) { + if (len(s) < 2) || (s[0] != '"') { + panic("invalid string") + } + for i := 1; i < len(s); i++ { + if s[i] == '"' { + return s[i+1:], &JSONToken{Kind: JSONTokenKindString, Raw: s[:i+1]} + } + } + panic("invalid string") +} + +// copiloted +func parseNumber(s string) (string, *JSONToken) { + if len(s) == 0 { + panic("invalid number") + } + i := 0 + if s[i] == '-' { + i++ + } + if i == len(s) { + panic("invalid number") + } + if s[i] == '0' { + i++ + } else if ('1' <= s[i]) && (s[i] <= '9') { + i++ + for (i < len(s)) && 
('0' <= s[i]) && (s[i] <= '9') { + i++ + } + } else { + panic("invalid number") + } + if i == len(s) { + return s[i:], &JSONToken{Kind: JSONTokenKindNumber, Raw: s} + } + if s[i] == '.' { + i++ + if i == len(s) { + panic("invalid number") + } + if ('0' <= s[i]) && (s[i] <= '9') { + i++ + for (i < len(s)) && ('0' <= s[i]) && (s[i] <= '9') { + i++ + } + } else { + panic("invalid number") + } + } + if i == len(s) { + return s[i:], &JSONToken{Kind: JSONTokenKindNumber, Raw: s} + } + if (s[i] == 'e') || (s[i] == 'E') { + i++ + if i == len(s) { + panic("invalid number") + } + if (s[i] == '+') || (s[i] == '-') { + i++ + } + if i == len(s) { + panic("invalid number") + } + if ('0' <= s[i]) && (s[i] <= '9') { + i++ + for (i < len(s)) && ('0' <= s[i]) && (s[i] <= '9') { + i++ + } + } else { + panic("invalid number") + } + } + return s[i:], &JSONToken{Kind: JSONTokenKindNumber, Raw: s[:i]} +} + +type JSONTokenKind int + +type JSONKind int + +const ( + JSONKindUnknown JSONKind = iota + JSONKindValue + JSONKindObject + JSONKindArray +) + +type JSONASTNode struct { + Kind JSONKind + ArrayChildren []*JSONASTNode + ObjectChildren []*JSONASTKV + ValueKind JSONTokenKind + Value string +} + +type JSONASTKV struct { + Key string + Value *JSONASTNode +} + +const ( + JSONTokenKindUnknown JSONTokenKind = iota + JSONTokenKindString + JSONTokenKindNumber + JSONTokenKindTrue + JSONTokenKindFalse + JSONTokenKindSpaces + JSONTokenKindComma + JSONTokenKindColon + JSONTokenKindOpenArray + JSONTokenKindCloseArray + JSONTokenKindOpenObject + JSONTokenKindCloseObject + JSONTokenKindNull +) + +func (k JSONTokenKind) String() string { + switch k { + case JSONTokenKindString: + return "string" + case JSONTokenKindNumber: + return "number" + case JSONTokenKindTrue: + return "true" + case JSONTokenKindFalse: + return "false" + case JSONTokenKindSpaces: + return "spaces" + case JSONTokenKindComma: + return "comma" + case JSONTokenKindColon: + return "colon" + case JSONTokenKindOpenArray: + return 
"open-array" + case JSONTokenKindCloseArray: + return "close-array" + case JSONTokenKindOpenObject: + return "open-object" + case JSONTokenKindCloseObject: + return "close-object" + case JSONTokenKindNull: + return "null" + default: + return "unknown" + } +} + +type JSONToken struct { + Kind JSONTokenKind + Raw string +} diff --git a/examples/gno.land/p/demo/ujson/strings.gno b/examples/gno.land/p/demo/ujson/strings.gno new file mode 100644 index 00000000000..14922f4b6db --- /dev/null +++ b/examples/gno.land/p/demo/ujson/strings.gno @@ -0,0 +1,229 @@ +package ujson + +import ( + "unicode" + "unicode/utf16" + "unicode/utf8" +) + +// Ported from https://cs.opensource.google/go/go/+/refs/tags/go1.20.6:src/encoding/json/encode.go +func FormatString(s string) string { + const escapeHTML = true + e := `"` // e.WriteByte('"') + start := 0 + for i := 0; i < len(s); { + if b := s[i]; b < utf8.RuneSelf { + if htmlSafeSet[b] || (!escapeHTML && safeSet[b]) { + i++ + continue + } + if start < i { + e += s[start:i] // e.WriteString(s[start:i]) + } + e += "\\" // e.WriteByte('\\') + switch b { + case '\\', '"': + e += string(b) // e.WriteByte(b) + case '\n': + e += "n" // e.WriteByte('n') + case '\r': + e += "r" // e.WriteByte('r') + case '\t': + e += "t" // e.WriteByte('t') + default: + // This encodes bytes < 0x20 except for \t, \n and \r. + // If escapeHTML is set, it also escapes <, >, and & + // because they can lead to security holes when + // user-controlled strings are rendered into JSON + // and served to some browsers. + e += `u00` // e.WriteString(`u00`) + e += string(hex[b>>4]) // e.WriteByte(hex[b>>4]) + e += string(hex[b&0xF]) // e.WriteByte(hex[b&0xF]) + } + i++ + start = i + continue + } + c, size := utf8.DecodeRuneInString(s[i:]) + if c == utf8.RuneError && size == 1 { + if start < i { + e += s[start:i] // e.WriteString(s[start:i]) + } + e += `\ufffd` // e.WriteString(`\ufffd`) + i += size + start = i + continue + } + // U+2028 is LINE SEPARATOR. 
+ // U+2029 is PARAGRAPH SEPARATOR. + // They are both technically valid characters in JSON strings, + // but don't work in JSONP, which has to be evaluated as JavaScript, + // and can lead to security holes there. It is valid JSON to + // escape them, so we do so unconditionally. + // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. + if c == '\u2028' || c == '\u2029' { + if start < i { + e += s[start:i] // e.WriteString(s[start:i]) + } + e += `\u202` // e.WriteString(`\u202`) + e += string(hex[c&0xF]) // e.WriteByte(hex[c&0xF]) + i += size + start = i + continue + } + i += size + } + if start < len(s) { + e += s[start:] // e.WriteString(s[start:]) + } + e += `"` // e.WriteByte('"') + return e +} + +// Ported from https://cs.opensource.google/go/go/+/refs/tags/go1.20.6:src/encoding/json/decode.go +// unquote converts a quoted JSON string literal s into an actual string t. +// The rules are different than for Go, so cannot use strconv.Unquote. +func ParseString(s string) string { + o, ok := unquoteBytes([]byte(s)) + if !ok { + panic("invalid string") + } + return string(o) +} + +func unquoteBytes(s []byte) (t []byte, ok bool) { + if len(s) < 2 || s[0] != '"' || s[len(s)-1] != '"' { + return + } + s = s[1 : len(s)-1] + + // Check for unusual characters. If there are none, + // then no unquoting is needed, so return a slice of the + // original bytes. + r := 0 + for r < len(s) { + c := s[r] + if c == '\\' || c == '"' || c < ' ' { + break + } + if c < utf8.RuneSelf { + r++ + continue + } + rr, size := utf8.DecodeRune(s[r:]) + if rr == utf8.RuneError && size == 1 { + break + } + r += size + } + if r == len(s) { + return s, true + } + + b := make([]byte, len(s)+2*utf8.UTFMax) + w := copy(b, s[0:r]) + for r < len(s) { + // Out of room? Can only happen if s is full of + // malformed UTF-8 and we're replacing each + // byte with RuneError. 
+		if w >= len(b)-2*utf8.UTFMax {
+			nb := make([]byte, (len(b)+utf8.UTFMax)*2)
+			copy(nb, b[0:w])
+			b = nb
+		}
+		switch c := s[r]; {
+		case c == '\\':
+			r++
+			if r >= len(s) {
+				return
+			}
+			switch s[r] {
+			default:
+				return
+			case '"', '\\', '/', '\'':
+				b[w] = s[r]
+				r++
+				w++
+			case 'b':
+				b[w] = '\b'
+				r++
+				w++
+			case 'f':
+				b[w] = '\f'
+				r++
+				w++
+			case 'n':
+				b[w] = '\n'
+				r++
+				w++
+			case 'r':
+				b[w] = '\r'
+				r++
+				w++
+			case 't':
+				b[w] = '\t'
+				r++
+				w++
+			case 'u':
+				r--
+				rr := getu4(s[r:])
+				if rr < 0 {
+					return
+				}
+				r += 6
+				if utf16.IsSurrogate(rr) {
+					rr1 := getu4(s[r:])
+					if dec := utf16.DecodeRune(rr, rr1); dec != unicode.ReplacementChar {
+						// A valid pair; consume.
+						r += 6
+						w += utf8.EncodeRune(b[w:], dec)
+						break
+					}
+					// Invalid surrogate; fall back to replacement rune.
+					rr = unicode.ReplacementChar
+				}
+				w += utf8.EncodeRune(b[w:], rr) // emit the decoded (or replacement) rune; without this, \uXXXX escapes outside a valid pair are dropped
+			}
+
+		// Quote, control characters are invalid.
+		case c == '"', c < ' ':
+			return
+
+		// ASCII
+		case c < utf8.RuneSelf:
+			b[w] = c
+			r++
+			w++
+
+		// Coerce to well-formed UTF-8.
+		default:
+			rr, size := utf8.DecodeRune(s[r:])
+			r += size
+			w += utf8.EncodeRune(b[w:], rr)
+		}
+	}
+	return b[0:w], true
+}
+
+// getu4 decodes \uXXXX from the beginning of s, returning the hex value,
+// or it returns -1.
+func getu4(s []byte) rune { + if len(s) < 6 || s[0] != '\\' || s[1] != 'u' { + return -1 + } + var r rune + for _, c := range s[2:6] { + switch { + case '0' <= c && c <= '9': + c = c - '0' + case 'a' <= c && c <= 'f': + c = c - 'a' + 10 + case 'A' <= c && c <= 'F': + c = c - 'A' + 10 + default: + return -1 + } + r = r*16 + rune(c) + } + return r +} diff --git a/examples/gno.land/p/demo/ujson/tables.gno b/examples/gno.land/p/demo/ujson/tables.gno new file mode 100644 index 00000000000..1ec2db8d917 --- /dev/null +++ b/examples/gno.land/p/demo/ujson/tables.gno @@ -0,0 +1,216 @@ +package ujson + +import "unicode/utf8" + +var hex = "0123456789abcdef" + +// safeSet holds the value true if the ASCII character with the given array +// position can be represented inside a JSON string without any further +// escaping. +// +// All values are true except for the ASCII control characters (0-31), the +// double quote ("), and the backslash character ("\"). +var safeSet = [utf8.RuneSelf]bool{ + ' ': true, + '!': true, + '"': false, + '#': true, + '$': true, + '%': true, + '&': true, + '\'': true, + '(': true, + ')': true, + '*': true, + '+': true, + ',': true, + '-': true, + '.': true, + '/': true, + '0': true, + '1': true, + '2': true, + '3': true, + '4': true, + '5': true, + '6': true, + '7': true, + '8': true, + '9': true, + ':': true, + ';': true, + '<': true, + '=': true, + '>': true, + '?': true, + '@': true, + 'A': true, + 'B': true, + 'C': true, + 'D': true, + 'E': true, + 'F': true, + 'G': true, + 'H': true, + 'I': true, + 'J': true, + 'K': true, + 'L': true, + 'M': true, + 'N': true, + 'O': true, + 'P': true, + 'Q': true, + 'R': true, + 'S': true, + 'T': true, + 'U': true, + 'V': true, + 'W': true, + 'X': true, + 'Y': true, + 'Z': true, + '[': true, + '\\': false, + ']': true, + '^': true, + '_': true, + '`': true, + 'a': true, + 'b': true, + 'c': true, + 'd': true, + 'e': true, + 'f': true, + 'g': true, + 'h': true, + 'i': true, + 'j': true, + 'k': true, + 'l': true, 
+ 'm': true, + 'n': true, + 'o': true, + 'p': true, + 'q': true, + 'r': true, + 's': true, + 't': true, + 'u': true, + 'v': true, + 'w': true, + 'x': true, + 'y': true, + 'z': true, + '{': true, + '|': true, + '}': true, + '~': true, + '\u007f': true, +} + +// htmlSafeSet holds the value true if the ASCII character with the given +// array position can be safely represented inside a JSON string, embedded +// inside of HTML