Skip to content

Commit

Permalink
fix: utf8 without TextEncoder/Decoder
Browse files Browse the repository at this point in the history
  • Loading branch information
gagdiez committed Nov 22, 2023
1 parent 2d28b1a commit ea62645
Show file tree
Hide file tree
Showing 13 changed files with 222 additions and 47 deletions.
23 changes: 20 additions & 3 deletions borsh-ts/deserialize.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import { ArrayType, DecodeTypes, MapType, IntegerType, OptionType, Schema, SetType, StructType, integers, EnumType } from './types.js';
import { DecodeBuffer } from './buffer.js';

import * as utfUtil from 'util';

export class BorshDeserializer {
buffer: DecodeBuffer;

Expand Down Expand Up @@ -56,7 +54,26 @@ export class BorshDeserializer {
/**
 * Decodes a length-prefixed UTF-8 string without relying on `TextDecoder`.
 *
 * Reads a `u32` byte count, consumes that many bytes and converts them to a
 * JavaScript string. Malformed input (stray continuation bytes, truncated or
 * overlong sequences, encoded surrogates, values above U+10FFFF) is replaced
 * with U+FFFD instead of yielding garbage or making `String.fromCodePoint` throw.
 *
 * @returns the decoded string
 */
decode_string(): string {
    const len: number = this.decode_integer('u32') as number;
    const buffer = new Uint8Array(this.buffer.consume_bytes(len));

    const REPLACEMENT = 0xfffd;
    // Code points are flushed in bounded chunks: spreading one huge array into
    // String.fromCodePoint exceeds the engine's argument limit on long strings.
    const CHUNK_SIZE = 0x2000;

    let decoded = '';
    let codePoints: number[] = [];
    let i = 0;
    while (i < buffer.length) {
        const lead = buffer[i++];
        let codePoint = REPLACEMENT; // stays U+FFFD for stray continuation / invalid lead bytes
        let remaining = 0; // continuation bytes still expected
        let smallest = 0; // lowest code point allowed for this length (rejects overlong forms)

        if (lead < 0x80) {
            codePoint = lead;
        } else if (lead >= 0xc2 && lead <= 0xdf) {
            codePoint = lead & 0x1f;
            remaining = 1;
            smallest = 0x80;
        } else if (lead >= 0xe0 && lead <= 0xef) {
            codePoint = lead & 0x0f;
            remaining = 2;
            smallest = 0x800;
        } else if (lead >= 0xf0 && lead <= 0xf4) {
            codePoint = lead & 0x07;
            remaining = 3;
            smallest = 0x10000;
        }

        for (; remaining > 0; remaining--) {
            // Stop at the end of input or at a byte that is not 0b10xxxxxx;
            // that byte is left unconsumed so it is decoded on its own.
            if (i >= buffer.length || (buffer[i] & 0xc0) !== 0x80) {
                break;
            }
            codePoint = (codePoint << 6) | (buffer[i++] & 0x3f);
        }

        const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
        if (remaining > 0 || codePoint < smallest || codePoint > 0x10ffff || isSurrogate) {
            codePoint = REPLACEMENT;
        }

        codePoints.push(codePoint);
        if (codePoints.length >= CHUNK_SIZE) {
            decoded += String.fromCodePoint(...codePoints);
            codePoints = [];
        }
    }

    return decoded + String.fromCodePoint(...codePoints);
}

decode_boolean(): boolean {
Expand Down
31 changes: 25 additions & 6 deletions borsh-ts/serialize.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ import { ArrayType, MapType, IntegerType, OptionType, Schema, SetType, StructTyp
import { EncodeBuffer } from './buffer.js';
import * as utils from './utils.js';

import * as utfUtil from 'util';

export class BorshSerializer {
encoded: EncodeBuffer;
fieldPath: string[];
Expand Down Expand Up @@ -63,13 +61,34 @@ export class BorshSerializer {

/**
 * Encodes a string as a `u32` byte length followed by its UTF-8 bytes,
 * without relying on `TextEncoder`.
 *
 * Surrogate pairs are combined into a single 4-byte sequence. A surrogate
 * that is not part of a valid pair (lone high, lone low, or a high surrogate
 * at the end of the string) is encoded as U+FFFD, which is what `TextEncoder`
 * produces, so the output is always well-formed UTF-8.
 *
 * @param value - the value to encode; checked to be a string when `checkTypes` is set
 */
encode_string(value: unknown): void {
    this.checkTypes && utils.expect_type(value, 'string', this.fieldPath);
    const _value = value as string;

    // encode to utf8 bytes without using TextEncoder
    const utf8Bytes: number[] = [];
    for (let i = 0; i < _value.length; i++) {
        let codePoint = _value.charCodeAt(i);

        if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
            const next = i + 1 < _value.length ? _value.charCodeAt(i + 1) : 0;
            const isPair = codePoint < 0xdc00 && next >= 0xdc00 && next <= 0xdfff;
            if (isPair) {
                codePoint = 0x10000 + (((codePoint & 0x3ff) << 10) | (next & 0x3ff));
                i++; // the low surrogate has been consumed as well
            } else {
                codePoint = 0xfffd;
            }
        }

        if (codePoint < 0x80) {
            utf8Bytes.push(codePoint);
        } else if (codePoint < 0x800) {
            utf8Bytes.push(0xc0 | (codePoint >> 6), 0x80 | (codePoint & 0x3f));
        } else if (codePoint < 0x10000) {
            utf8Bytes.push(
                0xe0 | (codePoint >> 12),
                0x80 | ((codePoint >> 6) & 0x3f),
                0x80 | (codePoint & 0x3f),
            );
        } else {
            utf8Bytes.push(
                0xf0 | (codePoint >> 18),
                0x80 | ((codePoint >> 12) & 0x3f),
                0x80 | ((codePoint >> 6) & 0x3f),
                0x80 | (codePoint & 0x3f),
            );
        }
    }

    // 4 bytes for length + string bytes
    this.encoded.store_value(utf8Bytes.length, 'u32');
    this.encoded.store_bytes(new Uint8Array(utf8Bytes));
}

encode_boolean(value: unknown): void {
Expand Down
4 changes: 2 additions & 2 deletions borsh-ts/test/(de)serialize.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ test('serialize booleans', async () => {
// Round-trips strings that exercise 1-, 2-, 3- and 4-byte UTF-8 sequences.
// Each expected array is the 4-byte length prefix followed by the UTF-8 bytes.
test('serialize strings', async () => {
    check_roundtrip('h"i', 'string', [3, 0, 0, 0, 104, 34, 105]);
    check_roundtrip('Chévere', 'string', [8, 0, 0, 0, 67, 104, 195, 169, 118, 101, 114, 101]);
    check_roundtrip('👍', 'string', [4, 0, 0, 0, 240, 159, 145, 141]);
    // 32 is the space and 230, 188, 162 is U+6F22, so the input must include ' 漢' to match the 8 expected bytes
    check_roundtrip('óñ 漢', 'string', [8, 0, 0, 0, 195, 179, 195, 177, 32, 230, 188, 162]);
    check_roundtrip('!ǬЇЉي࠺👍ઠ൧࿄ሒᘻᏠᬅᡝ࠻', 'string', [43, 0, 0, 0, 33, 199, 172, 208, 135, 208, 137, 217, 138, 224, 160, 186, 240, 159, 145, 141, 224, 170, 160, 224, 181, 167, 224, 191, 132, 225, 136, 146, 225, 152, 187, 225, 143, 160, 225, 172, 133, 225, 161, 157, 224, 160, 187]);
    check_roundtrip('óñ@‡؏ث 漢࠶⭐🔒􀀀', 'string', [30, 0, 0, 0, 195, 179, 195, 177, 64, 226, 128, 161, 216, 143, 216, 171, 32, 230, 188, 162, 224, 160, 182, 226, 173, 144, 240, 159, 148, 146, 244, 128, 128, 128]);
});

test('serialize floats', async () => {
Expand Down
30 changes: 30 additions & 0 deletions borsh-ts/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,34 @@ function validate_struct_schema(schema: { [key: string]: Schema }): void {
for (const key in schema) {
validate_schema(schema[key]);
}
}

/**
 * Encodes a single Unicode scalar value as UTF-8.
 *
 * @param codePoint - an integer in `[0, 0x10FFFF]` outside the surrogate range `0xD800`–`0xDFFF`
 * @returns the 1 to 4 bytes of the UTF-8 sequence
 * @throws RangeError if `codePoint` is not an integer, is out of range, or is a
 *   surrogate; such values have no well-formed UTF-8 encoding
 */
export function encodeCodePoint(codePoint: number): number[] {
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff || isSurrogate) {
        throw new RangeError(`encodeCodePoint: ${codePoint} is not a Unicode scalar value`);
    }

    if (codePoint < 0x80) { // 1-byte sequence
        return [codePoint];
    }
    if (codePoint < 0x800) { // 2-byte sequence
        return [0xc0 | (codePoint >> 6), createByte(codePoint, 0)];
    }
    if (codePoint < 0x10000) { // 3-byte sequence
        return [0xe0 | (codePoint >> 12), createByte(codePoint, 6), createByte(codePoint, 0)];
    }
    // 4-byte sequence
    return [
        0xf0 | (codePoint >> 18),
        createByte(codePoint, 12),
        createByte(codePoint, 6),
        createByte(codePoint, 0),
    ];
}

/** Builds a UTF-8 continuation byte (`0b10xxxxxx`) from the six bits of `codePoint` starting at bit `shift`. */
function createByte(codePoint: number, shift: number): number {
    return ((codePoint >> shift) & 0x3f) | 0x80;
}
46 changes: 21 additions & 25 deletions lib/cjs/deserialize.js
Original file line number Diff line number Diff line change
@@ -1,32 +1,8 @@
"use strict";
// Compiler-emitted helper used by __importStar: exposes property `k` of `m` on `o` under the name `k2` (`k` when `k2` is omitted).
// An existing `this.__createBinding` is reused when there is one.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Substitute a getter that reads m[k] on access when `m` has no own descriptor for `k`,
// when the descriptor is an accessor and `m` is not flagged __esModule,
// or when it is a writable/configurable data property.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback when Object.create is unavailable: copy the current value by plain assignment.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Compiler-emitted helper: stores `v` as the "default" property of `o`.
// Uses an enumerable defineProperty when Object.create exists, plain assignment otherwise;
// an existing `this.__setModuleDefault` is reused when there is one.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
// Compiler-emitted helper backing `import * as x`: returns `mod` unchanged when it is flagged __esModule;
// otherwise builds a new object carrying every own property of `mod` except "default",
// and sets its "default" to `mod` itself.
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// `mod != null` guards the property copy so a null/undefined module still yields an object with only "default".
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
exports.__esModule = true;
exports.BorshDeserializer = void 0;
var types_js_1 = require("./types.js");
var buffer_js_1 = require("./buffer.js");
var utfUtil = __importStar(require("util"));
var BorshDeserializer = /** @class */ (function () {
function BorshDeserializer(bufferArray) {
this.buffer = new buffer_js_1.DecodeBuffer(bufferArray);
Expand Down Expand Up @@ -79,7 +55,27 @@ var BorshDeserializer = /** @class */ (function () {
// Decodes a length-prefixed UTF-8 string without using TextDecoder.
// Reads a u32 byte count, consumes that many bytes and converts them to a string.
// Malformed input (stray continuation bytes, truncated or overlong sequences,
// encoded surrogates, values above U+10FFFF) is replaced with U+FFFD instead of
// yielding garbage or making String.fromCodePoint throw.
BorshDeserializer.prototype.decode_string = function () {
    var len = this.decode_integer('u32');
    var buffer = new Uint8Array(this.buffer.consume_bytes(len));
    var REPLACEMENT = 0xfffd;
    // Code points are flushed in bounded chunks: passing one huge array to
    // fromCodePoint.apply exceeds the engine's argument limit on long strings.
    var CHUNK_SIZE = 0x2000;
    var decoded = '';
    var codePoints = [];
    var i = 0;
    while (i < buffer.length) {
        var lead = buffer[i++];
        var codePoint = REPLACEMENT; // stays U+FFFD for stray continuation / invalid lead bytes
        var remaining = 0; // continuation bytes still expected
        var smallest = 0; // lowest code point allowed for this length (rejects overlong forms)
        if (lead < 0x80) {
            codePoint = lead;
        }
        else if (lead >= 0xc2 && lead <= 0xdf) {
            codePoint = lead & 0x1f;
            remaining = 1;
            smallest = 0x80;
        }
        else if (lead >= 0xe0 && lead <= 0xef) {
            codePoint = lead & 0x0f;
            remaining = 2;
            smallest = 0x800;
        }
        else if (lead >= 0xf0 && lead <= 0xf4) {
            codePoint = lead & 0x07;
            remaining = 3;
            smallest = 0x10000;
        }
        for (; remaining > 0; remaining--) {
            // Stop at the end of input or at a byte that is not 0b10xxxxxx;
            // that byte is left unconsumed so it is decoded on its own.
            if (i >= buffer.length || (buffer[i] & 0xc0) !== 0x80) {
                break;
            }
            codePoint = (codePoint << 6) | (buffer[i++] & 0x3f);
        }
        var isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
        if (remaining > 0 || codePoint < smallest || codePoint > 0x10ffff || isSurrogate) {
            codePoint = REPLACEMENT;
        }
        codePoints.push(codePoint);
        if (codePoints.length >= CHUNK_SIZE) {
            decoded += String.fromCodePoint.apply(String, codePoints);
            codePoints = [];
        }
    }
    return decoded + String.fromCodePoint.apply(String, codePoints);
};
BorshDeserializer.prototype.decode_boolean = function () {
return this.buffer.consume_value('u8') > 0;
Expand Down
25 changes: 21 additions & 4 deletions lib/cjs/serialize.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ exports.BorshSerializer = void 0;
var types_js_1 = require("./types.js");
var buffer_js_1 = require("./buffer.js");
var utils = __importStar(require("./utils.js"));
var utfUtil = __importStar(require("util"));
var BorshSerializer = /** @class */ (function () {
function BorshSerializer(checkTypes) {
this.encoded = new buffer_js_1.EncodeBuffer();
Expand Down Expand Up @@ -84,11 +83,29 @@ var BorshSerializer = /** @class */ (function () {
};
// Encodes a string as a u32 byte length followed by its UTF-8 bytes, without using TextEncoder.
// Surrogate pairs become one 4-byte sequence; a surrogate that is not part of a valid pair
// is encoded as U+FFFD (what TextEncoder produces), so the output is always well-formed UTF-8.
BorshSerializer.prototype.encode_string = function (value) {
    this.checkTypes && utils.expect_type(value, 'string', this.fieldPath);
    var _value = value;
    // encode to utf8 bytes without using TextEncoder
    var utf8Bytes = [];
    for (var i = 0; i < _value.length; i++) {
        var codePoint = _value.charCodeAt(i);
        if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
            var next = i + 1 < _value.length ? _value.charCodeAt(i + 1) : 0;
            var isPair = codePoint < 0xdc00 && next >= 0xdc00 && next <= 0xdfff;
            if (isPair) {
                codePoint = 0x10000 + (((codePoint & 0x3ff) << 10) | (next & 0x3ff));
                i++; // the low surrogate has been consumed as well
            }
            else {
                codePoint = 0xfffd;
            }
        }
        if (codePoint < 0x80) {
            utf8Bytes.push(codePoint);
        }
        else if (codePoint < 0x800) {
            utf8Bytes.push(0xc0 | (codePoint >> 6), 0x80 | (codePoint & 0x3f));
        }
        else if (codePoint < 0x10000) {
            utf8Bytes.push(0xe0 | (codePoint >> 12), 0x80 | ((codePoint >> 6) & 0x3f), 0x80 | (codePoint & 0x3f));
        }
        else {
            utf8Bytes.push(0xf0 | (codePoint >> 18), 0x80 | ((codePoint >> 12) & 0x3f), 0x80 | ((codePoint >> 6) & 0x3f), 0x80 | (codePoint & 0x3f));
        }
    }
    // 4 bytes for length + string bytes
    this.encoded.store_value(utf8Bytes.length, 'u32');
    this.encoded.store_bytes(new Uint8Array(utf8Bytes));
};
BorshSerializer.prototype.encode_boolean = function (value) {
this.checkTypes && utils.expect_type(value, 'boolean', this.fieldPath);
Expand Down
1 change: 1 addition & 0 deletions lib/cjs/utils.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ export declare class ErrorSchema extends Error {
constructor(schema: Schema, expected: string);
}
export declare function validate_schema(schema: Schema): void;
export declare function encodeCodePoint(codePoint: any): number[];
31 changes: 30 additions & 1 deletion lib/cjs/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ var __extends = (this && this.__extends) || (function () {
};
})();
exports.__esModule = true;
exports.validate_schema = exports.ErrorSchema = exports.expect_enum = exports.expect_same_size = exports.expect_bigint = exports.expect_type = exports.isArrayLike = void 0;
exports.encodeCodePoint = exports.validate_schema = exports.ErrorSchema = exports.expect_enum = exports.expect_same_size = exports.expect_bigint = exports.expect_type = exports.isArrayLike = void 0;
var types_js_1 = require("./types.js");
function isArrayLike(value) {
// source: https://stackoverflow.com/questions/24048547/checking-if-an-object-is-array-like
Expand Down Expand Up @@ -132,3 +132,32 @@ function validate_struct_schema(schema) {
validate_schema(schema[key]);
}
}
// Encodes a single Unicode scalar value as UTF-8 and returns its 1 to 4 bytes.
// Throws RangeError when codePoint is not an integer in [0, 0x10FFFF] or is a
// surrogate (0xD800-0xDFFF); such values have no well-formed UTF-8 encoding.
function encodeCodePoint(codePoint) {
    var isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10FFFF || isSurrogate) {
        throw new RangeError("encodeCodePoint: " + codePoint + " is not a Unicode scalar value");
    }
    if (codePoint < 0x80) { // 1-byte sequence
        return [codePoint];
    }
    var symbol;
    if (codePoint < 0x800) { // 2-byte sequence
        symbol = [((codePoint >> 6) & 0x1F) | 0xC0];
    }
    else if (codePoint < 0x10000) { // 3-byte sequence
        symbol = [
            ((codePoint >> 12) & 0x0F) | 0xE0,
            createByte(codePoint, 6)
        ];
    }
    else { // 4-byte sequence
        symbol = [
            ((codePoint >> 18) & 0x07) | 0xF0,
            createByte(codePoint, 12),
            createByte(codePoint, 6)
        ];
    }
    // every multi-byte sequence ends with the low six bits as a continuation byte
    symbol.push((codePoint & 0x3F) | 0x80);
    return symbol;
}
exports.encodeCodePoint = encodeCodePoint;
// Builds a UTF-8 continuation byte (0b10xxxxxx) from the six bits of codePoint starting at bit `shift`.
function createByte(codePoint, shift) {
    var payload = (codePoint >> shift) & 0x3F;
    return 0x80 | payload;
}
23 changes: 21 additions & 2 deletions lib/esm/deserialize.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { integers } from './types.js';
import { DecodeBuffer } from './buffer.js';
import * as utfUtil from 'util';
var BorshDeserializer = /** @class */ (function () {
function BorshDeserializer(bufferArray) {
this.buffer = new DecodeBuffer(bufferArray);
Expand Down Expand Up @@ -53,7 +52,27 @@ var BorshDeserializer = /** @class */ (function () {
// Decodes a length-prefixed UTF-8 string without using TextDecoder.
// Reads a u32 byte count, consumes that many bytes and converts them to a string.
// Malformed input (stray continuation bytes, truncated or overlong sequences,
// encoded surrogates, values above U+10FFFF) is replaced with U+FFFD instead of
// yielding garbage or making String.fromCodePoint throw.
BorshDeserializer.prototype.decode_string = function () {
    var len = this.decode_integer('u32');
    var buffer = new Uint8Array(this.buffer.consume_bytes(len));
    var REPLACEMENT = 0xfffd;
    // Code points are flushed in bounded chunks: passing one huge array to
    // fromCodePoint.apply exceeds the engine's argument limit on long strings.
    var CHUNK_SIZE = 0x2000;
    var decoded = '';
    var codePoints = [];
    var i = 0;
    while (i < buffer.length) {
        var lead = buffer[i++];
        var codePoint = REPLACEMENT; // stays U+FFFD for stray continuation / invalid lead bytes
        var remaining = 0; // continuation bytes still expected
        var smallest = 0; // lowest code point allowed for this length (rejects overlong forms)
        if (lead < 0x80) {
            codePoint = lead;
        }
        else if (lead >= 0xc2 && lead <= 0xdf) {
            codePoint = lead & 0x1f;
            remaining = 1;
            smallest = 0x80;
        }
        else if (lead >= 0xe0 && lead <= 0xef) {
            codePoint = lead & 0x0f;
            remaining = 2;
            smallest = 0x800;
        }
        else if (lead >= 0xf0 && lead <= 0xf4) {
            codePoint = lead & 0x07;
            remaining = 3;
            smallest = 0x10000;
        }
        for (; remaining > 0; remaining--) {
            // Stop at the end of input or at a byte that is not 0b10xxxxxx;
            // that byte is left unconsumed so it is decoded on its own.
            if (i >= buffer.length || (buffer[i] & 0xc0) !== 0x80) {
                break;
            }
            codePoint = (codePoint << 6) | (buffer[i++] & 0x3f);
        }
        var isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
        if (remaining > 0 || codePoint < smallest || codePoint > 0x10ffff || isSurrogate) {
            codePoint = REPLACEMENT;
        }
        codePoints.push(codePoint);
        if (codePoints.length >= CHUNK_SIZE) {
            decoded += String.fromCodePoint.apply(String, codePoints);
            codePoints = [];
        }
    }
    return decoded + String.fromCodePoint.apply(String, codePoints);
};
BorshDeserializer.prototype.decode_boolean = function () {
return this.buffer.consume_value('u8') > 0;
Expand Down
25 changes: 21 additions & 4 deletions lib/esm/serialize.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import { integers } from './types.js';
import { EncodeBuffer } from './buffer.js';
import * as utils from './utils.js';
import * as utfUtil from 'util';
var BorshSerializer = /** @class */ (function () {
function BorshSerializer(checkTypes) {
this.encoded = new EncodeBuffer();
Expand Down Expand Up @@ -58,11 +57,29 @@ var BorshSerializer = /** @class */ (function () {
};
// Encodes a string as a u32 byte length followed by its UTF-8 bytes, without using TextEncoder.
// Surrogate pairs become one 4-byte sequence; a surrogate that is not part of a valid pair
// is encoded as U+FFFD (what TextEncoder produces), so the output is always well-formed UTF-8.
BorshSerializer.prototype.encode_string = function (value) {
    this.checkTypes && utils.expect_type(value, 'string', this.fieldPath);
    var _value = value;
    // encode to utf8 bytes without using TextEncoder
    var utf8Bytes = [];
    for (var i = 0; i < _value.length; i++) {
        var codePoint = _value.charCodeAt(i);
        if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
            var next = i + 1 < _value.length ? _value.charCodeAt(i + 1) : 0;
            var isPair = codePoint < 0xdc00 && next >= 0xdc00 && next <= 0xdfff;
            if (isPair) {
                codePoint = 0x10000 + (((codePoint & 0x3ff) << 10) | (next & 0x3ff));
                i++; // the low surrogate has been consumed as well
            }
            else {
                codePoint = 0xfffd;
            }
        }
        if (codePoint < 0x80) {
            utf8Bytes.push(codePoint);
        }
        else if (codePoint < 0x800) {
            utf8Bytes.push(0xc0 | (codePoint >> 6), 0x80 | (codePoint & 0x3f));
        }
        else if (codePoint < 0x10000) {
            utf8Bytes.push(0xe0 | (codePoint >> 12), 0x80 | ((codePoint >> 6) & 0x3f), 0x80 | (codePoint & 0x3f));
        }
        else {
            utf8Bytes.push(0xf0 | (codePoint >> 18), 0x80 | ((codePoint >> 12) & 0x3f), 0x80 | ((codePoint >> 6) & 0x3f), 0x80 | (codePoint & 0x3f));
        }
    }
    // 4 bytes for length + string bytes
    this.encoded.store_value(utf8Bytes.length, 'u32');
    this.encoded.store_bytes(new Uint8Array(utf8Bytes));
};
BorshSerializer.prototype.encode_boolean = function (value) {
this.checkTypes && utils.expect_type(value, 'boolean', this.fieldPath);
Expand Down
1 change: 1 addition & 0 deletions lib/esm/utils.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ export declare class ErrorSchema extends Error {
constructor(schema: Schema, expected: string);
}
export declare function validate_schema(schema: Schema): void;
export declare function encodeCodePoint(codePoint: any): number[];
28 changes: 28 additions & 0 deletions lib/esm/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,31 @@ function validate_struct_schema(schema) {
validate_schema(schema[key]);
}
}
// Encodes a single Unicode scalar value as UTF-8 and returns its 1 to 4 bytes.
// Throws RangeError when codePoint is not an integer in [0, 0x10FFFF] or is a
// surrogate (0xD800-0xDFFF); such values have no well-formed UTF-8 encoding.
export function encodeCodePoint(codePoint) {
    var isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10FFFF || isSurrogate) {
        throw new RangeError("encodeCodePoint: " + codePoint + " is not a Unicode scalar value");
    }
    if (codePoint < 0x80) { // 1-byte sequence
        return [codePoint];
    }
    var symbol;
    if (codePoint < 0x800) { // 2-byte sequence
        symbol = [((codePoint >> 6) & 0x1F) | 0xC0];
    }
    else if (codePoint < 0x10000) { // 3-byte sequence
        symbol = [
            ((codePoint >> 12) & 0x0F) | 0xE0,
            createByte(codePoint, 6)
        ];
    }
    else { // 4-byte sequence
        symbol = [
            ((codePoint >> 18) & 0x07) | 0xF0,
            createByte(codePoint, 12),
            createByte(codePoint, 6)
        ];
    }
    // every multi-byte sequence ends with the low six bits as a continuation byte
    symbol.push((codePoint & 0x3F) | 0x80);
    return symbol;
}
// Builds a UTF-8 continuation byte (0b10xxxxxx) from the six bits of codePoint starting at bit `shift`.
function createByte(codePoint, shift) {
    return ((codePoint >> shift) & 0x3F) | 0x80;
}
Loading

0 comments on commit ea62645

Please sign in to comment.