From ea62645b3c79b2dd1bfc745ea30be72f47d08050 Mon Sep 17 00:00:00 2001 From: gagdiez Date: Wed, 22 Nov 2023 11:40:55 +0100 Subject: [PATCH] fix: utf8 without TextEncoder/Decoder --- borsh-ts/deserialize.ts | 23 +++++++++++++-- borsh-ts/serialize.ts | 31 +++++++++++++++---- borsh-ts/test/(de)serialize.test.js | 4 +-- borsh-ts/utils.ts | 30 +++++++++++++++++++ lib/cjs/deserialize.js | 46 +++++++++++++---------------- lib/cjs/serialize.js | 25 +++++++++++++--- lib/cjs/utils.d.ts | 1 + lib/cjs/utils.js | 31 ++++++++++++++++++- lib/esm/deserialize.js | 23 +++++++++++++-- lib/esm/serialize.js | 25 +++++++++++++--- lib/esm/utils.d.ts | 1 + lib/esm/utils.js | 28 ++++++++++++++++++ lib/types/utils.d.ts | 1 + 13 files changed, 222 insertions(+), 47 deletions(-) diff --git a/borsh-ts/deserialize.ts b/borsh-ts/deserialize.ts index cf87b415..ec0137df 100644 --- a/borsh-ts/deserialize.ts +++ b/borsh-ts/deserialize.ts @@ -1,8 +1,6 @@ import { ArrayType, DecodeTypes, MapType, IntegerType, OptionType, Schema, SetType, StructType, integers, EnumType } from './types.js'; import { DecodeBuffer } from './buffer.js'; -import * as utfUtil from 'util'; - export class BorshDeserializer { buffer: DecodeBuffer; @@ -56,7 +54,26 @@ export class BorshDeserializer { decode_string(): string { const len: number = this.decode_integer('u32') as number; const buffer = new Uint8Array(this.buffer.consume_bytes(len)); - return new utfUtil.TextDecoder().decode(buffer); + + // decode utf-8 string without using TextDecoder + // first get all bytes to single byte code points + const codePoints = []; + for (let i = 0; i < len; ++i) { + const byte = buffer[i]; + if (byte < 0x80) { + codePoints.push(byte); + } else if (byte < 0xE0) { + codePoints.push(((byte & 0x1F) << 6) | (buffer[++i] & 0x3F)); + } else if (byte < 0xF0) { + codePoints.push(((byte & 0x0F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F)); + } else { + const codePoint = ((byte & 0x07) << 18) | ((buffer[++i] & 0x3F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F); + codePoints.push(codePoint); + } + } + + // then decode code points to utf-8 + return String.fromCodePoint(...codePoints); } decode_boolean(): boolean { diff --git a/borsh-ts/serialize.ts b/borsh-ts/serialize.ts index 3b5b2ab8..3da73d4c 100644 --- a/borsh-ts/serialize.ts +++ b/borsh-ts/serialize.ts @@ -2,8 +2,6 @@ import { ArrayType, MapType, IntegerType, OptionType, Schema, SetType, StructTyp import { EncodeBuffer } from './buffer.js'; import * as utils from './utils.js'; -import * as utfUtil from 'util'; - export class BorshSerializer { encoded: EncodeBuffer; fieldPath: string[]; @@ -63,13 +61,34 @@ export class BorshSerializer { encode_string(value: unknown): void { this.checkTypes && utils.expect_type(value, 'string', this.fieldPath); - - // encode to utf8 bytes - const utf8Bytes = new utfUtil.TextEncoder().encode(value as string); + const _value = value as string; + + // encode to utf8 bytes without using TextEncoder + const utf8Bytes: number[] = []; + for (let i = 0; i < _value.length; i++) { + let charCode = _value.charCodeAt(i); + + if (charCode < 0x80) { + utf8Bytes.push(charCode); + } else if (charCode < 0x800) { + utf8Bytes.push(0xc0 | (charCode >> 6), 0x80 | (charCode & 0x3f)); + } else if (charCode < 0xd800 || charCode >= 0xe000) { + utf8Bytes.push(0xe0 | (charCode >> 12), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f)); + } else { + i++; + charCode = 0x10000 + (((charCode & 0x3ff) << 10) | (_value.charCodeAt(i) & 0x3ff)); + utf8Bytes.push( + 0xf0 | (charCode >> 18), + 0x80 | ((charCode >> 12) & 0x3f), + 0x80 | ((charCode >> 6) & 0x3f), + 0x80 | (charCode & 0x3f), + ); + } + } // 4 bytes for length + string bytes this.encoded.store_value(utf8Bytes.length, 'u32'); - this.encoded.store_bytes(utf8Bytes); + this.encoded.store_bytes(new Uint8Array(utf8Bytes)); } encode_boolean(value: unknown): void { diff --git a/borsh-ts/test/(de)serialize.test.js b/borsh-ts/test/(de)serialize.test.js index 42d171fa..1a44a054 100644 --- a/borsh-ts/test/(de)serialize.test.js +++ b/borsh-ts/test/(de)serialize.test.js @@ -41,8 +41,8 @@ test('serialize booleans', async () => { test('serialize strings', async () => { check_roundtrip('h"i', 'string', [3, 0, 0, 0, 104, 34, 105]); check_roundtrip('Chévere', 'string', [8, 0, 0, 0, 67, 104, 195, 169, 118, 101, 114, 101]); - check_roundtrip('👍', 'string', [4, 0, 0, 0, 240, 159, 145, 141]); - check_roundtrip('óñ 漢', 'string', [8, 0, 0, 0, 195, 179, 195, 177, 32, 230, 188, 162]); + check_roundtrip('!ǬЇЉي࠺👍ઠ൧࿄ሒᘻᏠᬅᡝ࠻', 'string', [43, 0, 0, 0, 33, 199, 172, 208, 135, 208, 137, 217, 138, 224, 160, 186, 240, 159, 145, 141, 224, 170, 160, 224, 181, 167, 224, 191, 132, 225, 136, 146, 225, 152, 187, 225, 143, 160, 225, 172, 133, 225, 161, 157, 224, 160, 187]); + check_roundtrip('óñ@‡؏ث 漢࠶⭐🔒􀀀', 'string', [30, 0, 0, 0, 195, 179, 195, 177, 64, 226, 128, 161, 216, 143, 216, 171, 32, 230, 188, 162, 224, 160, 182, 226, 173, 144, 240, 159, 148, 146, 244, 128, 128, 128]); }); test('serialize floats', async () => { diff --git a/borsh-ts/utils.ts b/borsh-ts/utils.ts index 778667c4..3e834b42 100644 --- a/borsh-ts/utils.ts +++ b/borsh-ts/utils.ts @@ -118,4 +118,34 @@ function validate_struct_schema(schema: { [key: string]: Schema }): void { for (const key in schema) { validate_schema(schema[key]); } +} + +// utf-8 encode +export function encodeCodePoint(codePoint): number[] { + if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence + return [codePoint]; + } + let symbol: number[] = []; + if ((codePoint & 0xFFFFF800) == 0) { // 2-byte sequence + symbol = [((codePoint >> 6) & 0x1F) | 0xC0]; + } + else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence + symbol = [ + ((codePoint >> 12) & 0x0F) | 0xE0, + createByte(codePoint, 6) + ]; + } + else if ((codePoint & 0xFFE00000) == 0) { // 4-byte sequence + symbol = [ + ((codePoint >> 18) & 0x07) | 0xF0, + createByte(codePoint, 12), + createByte(codePoint, 6) + ]; + } + symbol.push((codePoint & 0x3F) | 0x80); + return symbol; +} + +function createByte(codePoint, shift): number { + return ((codePoint >> shift) & 0x3F) | 0x80; } \ No newline at end of file diff --git a/lib/cjs/deserialize.js b/lib/cjs/deserialize.js index c3f0b297..67e09864 100644 --- a/lib/cjs/deserialize.js +++ b/lib/cjs/deserialize.js @@ -1,32 +1,8 @@ "use strict"; -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); - __setModuleDefault(result, mod); - return result; -}; exports.__esModule = true; exports.BorshDeserializer = void 0; var types_js_1 = require("./types.js"); var buffer_js_1 = require("./buffer.js"); -var utfUtil = __importStar(require("util")); var BorshDeserializer = /** @class */ (function () { function BorshDeserializer(bufferArray) { this.buffer = new buffer_js_1.DecodeBuffer(bufferArray); @@ -79,7 +55,27 @@ var BorshDeserializer = /** @class */ (function () { BorshDeserializer.prototype.decode_string = function () { var len = this.decode_integer('u32'); var buffer = new Uint8Array(this.buffer.consume_bytes(len)); - return new utfUtil.TextDecoder().decode(buffer); + // decode utf-8 string without using TextDecoder + // first get all bytes to single byte code points + var codePoints = []; + for (var i = 0; i < len; ++i) { + var byte = buffer[i]; + if (byte < 0x80) { + codePoints.push(byte); + } + else if (byte < 0xE0) { + codePoints.push(((byte & 0x1F) << 6) | (buffer[++i] & 0x3F)); + } + else if (byte < 0xF0) { + codePoints.push(((byte & 0x0F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F)); + } + else { + var codePoint = ((byte & 0x07) << 18) | ((buffer[++i] & 0x3F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F); + codePoints.push(codePoint); + } + } + // then decode code points to utf-8 + return String.fromCodePoint.apply(String, codePoints); }; BorshDeserializer.prototype.decode_boolean = function () { return this.buffer.consume_value('u8') > 0; diff --git a/lib/cjs/serialize.js b/lib/cjs/serialize.js index 15a9428d..45cfc2b3 100644 --- a/lib/cjs/serialize.js +++ b/lib/cjs/serialize.js @@ -27,7 +27,6 @@ exports.BorshSerializer = void 0; var types_js_1 = require("./types.js"); var buffer_js_1 = require("./buffer.js"); var utils = __importStar(require("./utils.js")); -var utfUtil = __importStar(require("util")); var BorshSerializer = /** @class */ (function () { function BorshSerializer(checkTypes) { this.encoded = new buffer_js_1.EncodeBuffer(); @@ -84,11 +83,29 @@ var BorshSerializer = /** @class */ (function () { }; BorshSerializer.prototype.encode_string = function (value) { this.checkTypes && utils.expect_type(value, 'string', this.fieldPath); - // encode to utf8 bytes - var utf8Bytes = new utfUtil.TextEncoder().encode(value); + var _value = value; + // encode to utf8 bytes without using TextEncoder + var utf8Bytes = []; + for (var i = 0; i < _value.length; i++) { + var charCode = _value.charCodeAt(i); + if (charCode < 0x80) { + utf8Bytes.push(charCode); + } + else if (charCode < 0x800) { + utf8Bytes.push(0xc0 | (charCode >> 6), 0x80 | (charCode & 0x3f)); + } + else if (charCode < 0xd800 || charCode >= 0xe000) { + utf8Bytes.push(0xe0 | (charCode >> 12), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f)); + } + else { + i++; + charCode = 0x10000 + (((charCode & 0x3ff) << 10) | (_value.charCodeAt(i) & 0x3ff)); + utf8Bytes.push(0xf0 | (charCode >> 18), 0x80 | ((charCode >> 12) & 0x3f), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f)); + } + } // 4 bytes for length + string bytes this.encoded.store_value(utf8Bytes.length, 'u32'); - this.encoded.store_bytes(utf8Bytes); + this.encoded.store_bytes(new Uint8Array(utf8Bytes)); }; BorshSerializer.prototype.encode_boolean = function (value) { this.checkTypes && utils.expect_type(value, 'boolean', this.fieldPath); diff --git a/lib/cjs/utils.d.ts b/lib/cjs/utils.d.ts index 3047d77b..44a5a32c 100644 --- a/lib/cjs/utils.d.ts +++ b/lib/cjs/utils.d.ts @@ -8,3 +8,4 @@ export declare class ErrorSchema extends Error { constructor(schema: Schema, expected: string); } export declare function validate_schema(schema: Schema): void; +export declare function encodeCodePoint(codePoint: any): number[]; diff --git a/lib/cjs/utils.js b/lib/cjs/utils.js index 317783f9..fdcb6f67 100644 --- a/lib/cjs/utils.js +++ b/lib/cjs/utils.js @@ -15,7 +15,7 @@ var __extends = (this && this.__extends) || (function () { }; })(); exports.__esModule = true; -exports.validate_schema = exports.ErrorSchema = exports.expect_enum = exports.expect_same_size = exports.expect_bigint = exports.expect_type = exports.isArrayLike = void 0; +exports.encodeCodePoint = exports.validate_schema = exports.ErrorSchema = exports.expect_enum = exports.expect_same_size = exports.expect_bigint = exports.expect_type = exports.isArrayLike = void 0; var types_js_1 = require("./types.js"); function isArrayLike(value) { // source: https://stackoverflow.com/questions/24048547/checking-if-an-object-is-array-like @@ -132,3 +132,32 @@ function validate_struct_schema(schema) { validate_schema(schema[key]); } } +// utf-8 encode +function encodeCodePoint(codePoint) { + if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence + return [codePoint]; + } + var symbol = []; + if ((codePoint & 0xFFFFF800) == 0) { // 2-byte sequence + symbol = [((codePoint >> 6) & 0x1F) | 0xC0]; + } + else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence + symbol = [ + ((codePoint >> 12) & 0x0F) | 0xE0, + createByte(codePoint, 6) + ]; + } + else if ((codePoint & 0xFFE00000) == 0) { // 4-byte sequence + symbol = [ + ((codePoint >> 18) & 0x07) | 0xF0, + createByte(codePoint, 12), + createByte(codePoint, 6) + ]; + } + symbol.push((codePoint & 0x3F) | 0x80); + return symbol; +} +exports.encodeCodePoint = encodeCodePoint; +function createByte(codePoint, shift) { + return ((codePoint >> shift) & 0x3F) | 0x80; +} diff --git a/lib/esm/deserialize.js b/lib/esm/deserialize.js index 818b8cad..502a86fd 100644 --- a/lib/esm/deserialize.js +++ b/lib/esm/deserialize.js @@ -1,6 +1,5 @@ import { integers } from './types.js'; import { DecodeBuffer } from './buffer.js'; -import * as utfUtil from 'util'; var BorshDeserializer = /** @class */ (function () { function BorshDeserializer(bufferArray) { this.buffer = new DecodeBuffer(bufferArray); @@ -53,7 +52,27 @@ var BorshDeserializer = /** @class */ (function () { BorshDeserializer.prototype.decode_string = function () { var len = this.decode_integer('u32'); var buffer = new Uint8Array(this.buffer.consume_bytes(len)); - return new utfUtil.TextDecoder().decode(buffer); + // decode utf-8 string without using TextDecoder + // first get all bytes to single byte code points + var codePoints = []; + for (var i = 0; i < len; ++i) { + var byte = buffer[i]; + if (byte < 0x80) { + codePoints.push(byte); + } + else if (byte < 0xE0) { + codePoints.push(((byte & 0x1F) << 6) | (buffer[++i] & 0x3F)); + } + else if (byte < 0xF0) { + codePoints.push(((byte & 0x0F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F)); + } + else { + var codePoint = ((byte & 0x07) << 18) | ((buffer[++i] & 0x3F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F); + codePoints.push(codePoint); + } + } + // then decode code points to utf-8 + return String.fromCodePoint.apply(String, codePoints); }; BorshDeserializer.prototype.decode_boolean = function () { return this.buffer.consume_value('u8') > 0; diff --git a/lib/esm/serialize.js b/lib/esm/serialize.js index c76d7549..1e76225f 100644 --- a/lib/esm/serialize.js +++ b/lib/esm/serialize.js @@ -1,7 +1,6 @@ import { integers } from './types.js'; import { EncodeBuffer } from './buffer.js'; import * as utils from './utils.js'; -import * as utfUtil from 'util'; var BorshSerializer = /** @class */ (function () { function BorshSerializer(checkTypes) { this.encoded = new EncodeBuffer(); @@ -58,11 +57,29 @@ var BorshSerializer = /** @class */ (function () { }; BorshSerializer.prototype.encode_string = function (value) { this.checkTypes && utils.expect_type(value, 'string', this.fieldPath); - // encode to utf8 bytes - var utf8Bytes = new utfUtil.TextEncoder().encode(value); + var _value = value; + // encode to utf8 bytes without using TextEncoder + var utf8Bytes = []; + for (var i = 0; i < _value.length; i++) { + var charCode = _value.charCodeAt(i); + if (charCode < 0x80) { + utf8Bytes.push(charCode); + } + else if (charCode < 0x800) { + utf8Bytes.push(0xc0 | (charCode >> 6), 0x80 | (charCode & 0x3f)); + } + else if (charCode < 0xd800 || charCode >= 0xe000) { + utf8Bytes.push(0xe0 | (charCode >> 12), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f)); + } + else { + i++; + charCode = 0x10000 + (((charCode & 0x3ff) << 10) | (_value.charCodeAt(i) & 0x3ff)); + utf8Bytes.push(0xf0 | (charCode >> 18), 0x80 | ((charCode >> 12) & 0x3f), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f)); + } + } // 4 bytes for length + string bytes this.encoded.store_value(utf8Bytes.length, 'u32'); - this.encoded.store_bytes(utf8Bytes); + this.encoded.store_bytes(new Uint8Array(utf8Bytes)); }; BorshSerializer.prototype.encode_boolean = function (value) { this.checkTypes && utils.expect_type(value, 'boolean', this.fieldPath); diff --git a/lib/esm/utils.d.ts b/lib/esm/utils.d.ts index 3047d77b..44a5a32c 100644 --- a/lib/esm/utils.d.ts +++ b/lib/esm/utils.d.ts @@ -8,3 +8,4 @@ export declare class ErrorSchema extends Error { constructor(schema: Schema, expected: string); } export declare function validate_schema(schema: Schema): void; +export declare function encodeCodePoint(codePoint: any): number[]; diff --git a/lib/esm/utils.js b/lib/esm/utils.js index 6e722cbf..0c039b8d 100644 --- a/lib/esm/utils.js +++ b/lib/esm/utils.js @@ -123,3 +123,31 @@ function validate_struct_schema(schema) { validate_schema(schema[key]); } } +// utf-8 encode +export function encodeCodePoint(codePoint) { + if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence + return [codePoint]; + } + var symbol = []; + if ((codePoint & 0xFFFFF800) == 0) { // 2-byte sequence + symbol = [((codePoint >> 6) & 0x1F) | 0xC0]; + } + else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence + symbol = [ + ((codePoint >> 12) & 0x0F) | 0xE0, + createByte(codePoint, 6) + ]; + } + else if ((codePoint & 0xFFE00000) == 0) { // 4-byte sequence + symbol = [ + ((codePoint >> 18) & 0x07) | 0xF0, + createByte(codePoint, 12), + createByte(codePoint, 6) + ]; + } + symbol.push((codePoint & 0x3F) | 0x80); + return symbol; +} +function createByte(codePoint, shift) { + return ((codePoint >> shift) & 0x3F) | 0x80; +} diff --git a/lib/types/utils.d.ts b/lib/types/utils.d.ts index 3047d77b..44a5a32c 100644 --- a/lib/types/utils.d.ts +++ b/lib/types/utils.d.ts @@ -8,3 +8,4 @@ export declare class ErrorSchema extends Error { constructor(schema: Schema, expected: string); } export declare function validate_schema(schema: Schema): void; +export declare function encodeCodePoint(codePoint: any): number[];