Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(NODE-5363): defer byte slicing to utf8 decoding API in nodejs #585

Merged
merged 15 commits into from
Jul 3, 2023
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions etc/benchmarks/bson_versions.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
{
"versions": [
"1.1.6",
"4.6",
"5.0",
"5.1",
"5.2",
"5.3"
]
}
3 changes: 2 additions & 1 deletion etc/benchmarks/install_bson_versions.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/bin/bash
versions=$(jq '.versions' < bson_versions.json | sed -E 's/(\[|\]|,|")//g')
# To be run from repo root
versions=$(jq '.versions' < etc/benchmarks/bson_versions.json | sed -E 's/(\[|\]|,|")//g')
installVersions=''
for bson in $versions; do
versionNoDot=$(echo $bson | tr -d '.')
Expand Down
55 changes: 55 additions & 0 deletions etc/benchmarks/main.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,61 @@ await runner({
}
});

// Benchmark case: deserialize one large serialized batch many times, with each
// deserialization scheduled on its own timer.
// NOTE(review): `skip: true` presumably tells `runner` not to execute this case
// by default — confirm against the runner implementation.
await runner({
  skip: true,
  name: 'deserialize a large batch of documents each with an array of many Int32s',
  iterations,
  /**
   * Builds the serialized input using the first library in `libs`.
   * The document has a single `nextBatch` array of 1000 sub-documents, each with
   * a fresh ObjectId `_id` and a 100-element `arrayField` holding the plain JS
   * integers 0..99 (the benchmark name describes these as Int32s).
   */
  setup(libs) {
    const { lib } = libs[0];
    const makeBatchEntry = () => ({
      _id: new lib.ObjectId(),
      arrayField: Array.from({ length: 100 }, (_, index) => index)
    });
    return lib.serialize({
      nextBatch: Array.from({ length: 1000 }, () => makeBatchEntry())
    });
  },
  /**
   * Schedules 100 deserializations of `document`, each behind its own 20 ms
   * timer, and resolves once every one of them has completed.
   */
  async run(i, bson, document) {
    const pending = [];
    for (let task = 0; task < 100; task++) {
      pending.push(
        new Promise(resolve => {
          setTimeout(() => {
            // The utf8 validation option is passed as `false` for every call.
            resolve(bson.lib.deserialize(document, { validation: { utf8: false } }));
          }, 20);
        })
      );
    }
    await Promise.all(pending);
  }
});

// Benchmark case: same shape as an Int32 array batch, but every array element is
// a `Long` created with `Long.fromInt`, so the arrays hold 64-bit integer values.
// NOTE(review): `skip: true` presumably tells `runner` not to execute this case
// by default — confirm against the runner implementation.
await runner({
  skip: true,
  name: 'deserialize a large batch of documents each with an array of many Int64s',
  iterations,
  /**
   * Builds the serialized input using the first library in `libs`.
   * The document has a single `nextBatch` array of 1000 sub-documents, each with
   * a fresh ObjectId `_id` and a 100-element `arrayField` of `Long.fromInt(0..99)`.
   */
  setup(libs) {
    const { lib } = libs[0];
    const makeBatchEntry = () => ({
      _id: new lib.ObjectId(),
      arrayField: Array.from({ length: 100 }, (_, index) => lib.Long.fromInt(index))
    });
    return lib.serialize({
      nextBatch: Array.from({ length: 1000 }, () => makeBatchEntry())
    });
  },
  /**
   * Schedules 100 deserializations of `document`, each behind its own 20 ms
   * timer, and resolves once every one of them has completed.
   */
  async run(i, bson, document) {
    const pending = [];
    for (let task = 0; task < 100; task++) {
      pending.push(
        new Promise(resolve => {
          setTimeout(() => {
            // The utf8 validation option is passed as `false` for every call.
            resolve(bson.lib.deserialize(document, { validation: { utf8: false } }));
          }, 20);
        })
      );
    }
    await Promise.all(pending);
  }
});
durran marked this conversation as resolved.
Show resolved Hide resolved
// End
console.log(
'Total time taken to benchmark:',
Expand Down
14 changes: 7 additions & 7 deletions src/parser/deserializer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ function deserializeObject(
if (i >= buffer.byteLength) throw new BSONError('Bad BSON Document: illegal CString');

// Represents the key
const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer.subarray(index, i));
const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i);

// shouldValidateKey is true if the key should be validated, false otherwise
let shouldValidateKey = true;
Expand Down Expand Up @@ -476,7 +476,7 @@ function deserializeObject(
// If we are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const source = ByteUtils.toUTF8(buffer.subarray(index, i));
const source = ByteUtils.toUTF8(buffer, index, i);
// Create the regexp
index = i + 1;

Expand All @@ -489,7 +489,7 @@ function deserializeObject(
// If we are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const regExpOptions = ByteUtils.toUTF8(buffer.subarray(index, i));
const regExpOptions = ByteUtils.toUTF8(buffer, index, i);
index = i + 1;

// For each option add the corresponding one for javascript
Expand Down Expand Up @@ -521,7 +521,7 @@ function deserializeObject(
// If we are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const source = ByteUtils.toUTF8(buffer.subarray(index, i));
const source = ByteUtils.toUTF8(buffer, index, i);
index = i + 1;

// Get the start search index
Expand All @@ -533,7 +533,7 @@ function deserializeObject(
// If we are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const regExpOptions = ByteUtils.toUTF8(buffer.subarray(index, i));
const regExpOptions = ByteUtils.toUTF8(buffer, index, i);
index = i + 1;

// Set the object
Expand Down Expand Up @@ -678,7 +678,7 @@ function deserializeObject(
throw new BSONError('Invalid UTF-8 string in BSON document');
}
}
const namespace = ByteUtils.toUTF8(buffer.subarray(index, index + stringSize - 1));
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1);
// Update parse index position
index = index + stringSize;

Expand Down Expand Up @@ -735,7 +735,7 @@ function getValidatedString(
end: number,
shouldValidateUtf8: boolean
) {
const value = ByteUtils.toUTF8(buffer.subarray(start, end));
const value = ByteUtils.toUTF8(buffer, start, end);
// if utf8 validation is on, do the check
if (shouldValidateUtf8) {
for (let i = 0; i < value.length; i++) {
Expand Down
2 changes: 1 addition & 1 deletion src/utils/byte_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export type ByteUtils = {
/** Create a Uint8Array containing utf8 code units from a string */
fromUTF8: (text: string) => Uint8Array;
/** Create a string from utf8 code units */
toUTF8: (buffer: Uint8Array) => string;
toUTF8: (buffer: Uint8Array, start?: number, end?: number) => string;
nbbeeken marked this conversation as resolved.
Show resolved Hide resolved
/** Get the utf8 code unit count from a string if it were to be transformed to utf8 */
utf8ByteLength: (input: string) => number;
/** Encode UTF8 bytes generated from `source` string into `destination` at byteOffset. Returns the number of bytes encoded. */
Expand Down
6 changes: 3 additions & 3 deletions src/utils/node_byte_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ type NodeJsBuffer = ArrayBufferView &
Uint8Array & {
write(string: string, offset: number, length: undefined, encoding: 'utf8'): number;
copy(target: Uint8Array, targetStart: number, sourceStart: number, sourceEnd: number): number;
toString: (this: Uint8Array, encoding: NodeJsEncoding) => string;
toString: (this: Uint8Array, encoding: NodeJsEncoding, start?: number, end?: number) => string;
equals: (this: Uint8Array, other: Uint8Array) => boolean;
};
type NodeJsBufferConstructor = Omit<Uint8ArrayConstructor, 'from'> & {
Expand Down Expand Up @@ -125,8 +125,8 @@ export const nodeJsByteUtils = {
return Buffer.from(text, 'utf8');
},

toUTF8(buffer: Uint8Array): string {
return nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8');
toUTF8(buffer: Uint8Array, start: number, end: number): string {
return nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
},

utf8ByteLength(input: string): number {
Expand Down
4 changes: 2 additions & 2 deletions src/utils/web_byte_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,8 @@ export const webByteUtils = {
return new TextEncoder().encode(text);
},

toUTF8(uint8array: Uint8Array): string {
return new TextDecoder('utf8', { fatal: false }).decode(uint8array);
toUTF8(uint8array: Uint8Array, start?: number, end?: number): string {
return new TextDecoder('utf8', { fatal: false }).decode(uint8array.slice(start, end));
},

utf8ByteLength(input: string): number {
Expand Down