Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

src: use simdutf for converting externalized builtins to UTF-16 #46119

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2024,11 +2024,7 @@ def make_bin_override():
for builtin in shareable_builtins:
builtin_id = 'node_shared_builtin_' + builtin.replace('/', '_') + '_path'
if getattr(options, builtin_id):
if options.with_intl == 'none':
option_name = '--shared-builtin-' + builtin + '-path'
error(option_name + ' is incompatible with --with-intl=none' )
else:
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
else:
output['variables']['node_builtin_shareable_builtins'] += [shareable_builtins[builtin]]

Expand Down
1 change: 1 addition & 0 deletions node.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,7 @@
'deps/googletest/googletest.gyp:gtest_main',
'deps/histogram/histogram.gyp:histogram',
'deps/uvwasi/uvwasi.gyp:uvwasi',
'deps/simdutf/simdutf.gyp:simdutf',
],

'includes': [
Expand Down
15 changes: 2 additions & 13 deletions src/api/environment.cc
Original file line number Diff line number Diff line change
Expand Up @@ -467,21 +467,10 @@ MaybeLocal<Value> LoadEnvironment(
Environment* env,
const char* main_script_source_utf8) {
CHECK_NOT_NULL(main_script_source_utf8);
Isolate* isolate = env->isolate();
return LoadEnvironment(
env,
[&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
// This is a slightly hacky way to convert UTF-8 to UTF-16.
Local<String> str =
String::NewFromUtf8(isolate,
main_script_source_utf8).ToLocalChecked();
auto main_utf16 = std::make_unique<String::Value>(isolate, str);

// TODO(addaleax): Avoid having a global table for all scripts.
env, [&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
std::string name = "embedder_main_" + std::to_string(env->thread_id());
builtins::BuiltinLoader::Add(
name.c_str(), UnionBytes(**main_utf16, main_utf16->length()));
env->set_main_utf16(std::move(main_utf16));
builtins::BuiltinLoader::Add(name.c_str(), main_script_source_utf8);
Realm* realm = env->principal_realm();

return realm->ExecuteBootstrapper(name.c_str());
Expand Down
5 changes: 0 additions & 5 deletions src/env-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -805,11 +805,6 @@ void Environment::RemoveCleanupHook(CleanupQueue::Callback fn, void* arg) {
cleanup_queue_.Remove(fn, arg);
}

void Environment::set_main_utf16(std::unique_ptr<v8::String::Value> str) {
CHECK(!main_utf16_);
main_utf16_ = std::move(str);
}

void Environment::set_process_exit_handler(
std::function<void(Environment*, ExitCode)>&& handler) {
process_exit_handler_ = std::move(handler);
Expand Down
6 changes: 0 additions & 6 deletions src/env.h
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,6 @@ class Environment : public MemoryRetainer {

#endif // HAVE_INSPECTOR

inline void set_main_utf16(std::unique_ptr<v8::String::Value>);
inline void set_process_exit_handler(
std::function<void(Environment*, ExitCode)>&& handler);

Expand Down Expand Up @@ -1144,11 +1143,6 @@ class Environment : public MemoryRetainer {

std::unique_ptr<Realm> principal_realm_ = nullptr;

// Keeps the main script source alive is one was passed to LoadEnvironment().
// We should probably find a way to just use plain `v8::String`s created from
// the source passed to LoadEnvironment() directly instead.
std::unique_ptr<v8::String::Value> main_utf16_;

// Used by allocate_managed_buffer() and release_managed_buffer() to keep
// track of the BackingStore for a given pointer.
std::unordered_map<char*, std::unique_ptr<v8::BackingStore>>
Expand Down
26 changes: 14 additions & 12 deletions src/node_builtins.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "env-inl.h"
#include "node_external_reference.h"
#include "node_internals.h"
#include "simdutf.h"
#include "util-inl.h"

namespace node {
Expand Down Expand Up @@ -35,7 +36,6 @@ BuiltinLoader BuiltinLoader::instance_;

BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
LoadJavaScriptSource();
#if defined(NODE_HAVE_I18N_SUPPORT)
#ifdef NODE_SHARED_BUILTIN_CJS_MODULE_LEXER_LEXER_PATH
AddExternalizedBuiltin(
"internal/deps/cjs-module-lexer/lexer",
Expand All @@ -52,7 +52,6 @@ BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
AddExternalizedBuiltin("internal/deps/undici/undici",
STRINGIFY(NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH));
#endif // NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH
#endif // NODE_HAVE_I18N_SUPPORT
}

BuiltinLoader* BuiltinLoader::GetInstance() {
Expand Down Expand Up @@ -240,7 +239,6 @@ MaybeLocal<String> BuiltinLoader::LoadBuiltinSource(Isolate* isolate,
#endif // NODE_BUILTIN_MODULES_PATH
}

#if defined(NODE_HAVE_I18N_SUPPORT)
void BuiltinLoader::AddExternalizedBuiltin(const char* id,
const char* filename) {
std::string source;
Expand All @@ -252,16 +250,20 @@ void BuiltinLoader::AddExternalizedBuiltin(const char* id,
return;
}

icu::UnicodeString utf16 = icu::UnicodeString::fromUTF8(
icu::StringPiece(source.data(), source.length()));
auto source_utf16 = std::make_unique<icu::UnicodeString>(utf16);
Add(id,
UnionBytes(reinterpret_cast<const uint16_t*>((*source_utf16).getBuffer()),
utf16.length()));
// keep source bytes for builtin alive while BuiltinLoader exists
GetInstance()->externalized_source_bytes_.push_back(std::move(source_utf16));
Add(id, source);
}

bool BuiltinLoader::Add(const char* id, std::string_view utf8source) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we can just create a UnionBytes constructor that takes a std::string_view that does this? (then I guess that UnionBytes can own the vector?))

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, UnionBytes that supports storing UTF-8 directly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fwiw, I’ll merge this in order to unblock #45942, but I’m still happy to iterate further here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A union for everything ;)

size_t expected_u16_length =
simdutf::utf16_length_from_utf8(utf8source.data(), utf8source.length());
auto out = std::make_shared<std::vector<uint16_t>>(expected_u16_length);
size_t u16_length = simdutf::convert_utf8_to_utf16le(
utf8source.data(),
utf8source.length(),
reinterpret_cast<char16_t*>(out->data()));
out->resize(u16_length);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this bit really necessary?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What kind of alternative would you suggest? Or is this about the resize() call?

return Add(id, UnionBytes(out));
}
#endif // NODE_HAVE_I18N_SUPPORT

// Returns Local<Function> of the compiled module if return_code_cache
// is false (we are only compiling the function).
Expand Down
9 changes: 1 addition & 8 deletions src/node_builtins.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@
#include <memory>
#include <set>
#include <string>
#if defined(NODE_HAVE_I18N_SUPPORT)
#include <unicode/unistr.h>
#endif // NODE_HAVE_I18N_SUPPORT
#include <vector>
#include "node_mutex.h"
#include "node_union_bytes.h"
Expand Down Expand Up @@ -71,6 +68,7 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
static v8::Local<v8::String> GetConfigString(v8::Isolate* isolate);
static bool Exists(const char* id);
static bool Add(const char* id, const UnionBytes& source);
static bool Add(const char* id, std::string_view utf8source);

static bool CompileAllBuiltins(v8::Local<v8::Context> context);
static void RefreshCodeCache(const std::vector<CodeCacheInfo>& in);
Expand Down Expand Up @@ -136,18 +134,13 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
static void HasCachedBuiltins(
const v8::FunctionCallbackInfo<v8::Value>& args);

#if defined(NODE_HAVE_I18N_SUPPORT)
static void AddExternalizedBuiltin(const char* id, const char* filename);
#endif // NODE_HAVE_I18N_SUPPORT

static BuiltinLoader instance_;
BuiltinCategories builtin_categories_;
BuiltinSourceMap source_;
BuiltinCodeCacheMap code_cache_;
UnionBytes config_;
#if defined(NODE_HAVE_I18N_SUPPORT)
std::list<std::unique_ptr<icu::UnicodeString>> externalized_source_bytes_;
#endif // NODE_HAVE_I18N_SUPPORT

// Used to synchronize access to the code cache map
Mutex code_cache_mutex_;
Expand Down
64 changes: 9 additions & 55 deletions src/node_union_bytes.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,57 +11,20 @@

namespace node {

class NonOwningExternalOneByteResource
: public v8::String::ExternalOneByteStringResource {
public:
explicit NonOwningExternalOneByteResource(const uint8_t* data, size_t length)
: data_(data), length_(length) {}
~NonOwningExternalOneByteResource() override = default;

const char* data() const override {
return reinterpret_cast<const char*>(data_);
}
size_t length() const override { return length_; }

NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) =
delete;
NonOwningExternalOneByteResource& operator=(
const NonOwningExternalOneByteResource&) = delete;

private:
const uint8_t* data_;
size_t length_;
};

class NonOwningExternalTwoByteResource
: public v8::String::ExternalStringResource {
public:
explicit NonOwningExternalTwoByteResource(const uint16_t* data, size_t length)
: data_(data), length_(length) {}
~NonOwningExternalTwoByteResource() override = default;

const uint16_t* data() const override { return data_; }
size_t length() const override { return length_; }

NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) =
delete;
NonOwningExternalTwoByteResource& operator=(
const NonOwningExternalTwoByteResource&) = delete;

private:
const uint16_t* data_;
size_t length_;
};

// Similar to a v8::String, but it's independent from Isolates
// and can be materialized in Isolates as external Strings
// via ToStringChecked. The data pointers are owned by the caller.
// via ToStringChecked.
class UnionBytes {
public:
UnionBytes(const uint16_t* data, size_t length)
: one_bytes_(nullptr), two_bytes_(data), length_(length) {}
UnionBytes(const uint8_t* data, size_t length)
: one_bytes_(data), two_bytes_(nullptr), length_(length) {}
template <typename T> // T = uint8_t or uint16_t
explicit UnionBytes(std::shared_ptr<std::vector</*const*/ T>> data)
: UnionBytes(data->data(), data->size()) {
owning_ptr_ = data;
}

UnionBytes(const UnionBytes&) = default;
UnionBytes& operator=(const UnionBytes&) = default;
Expand All @@ -77,23 +40,14 @@ class UnionBytes {
CHECK_NOT_NULL(one_bytes_);
return one_bytes_;
}
v8::Local<v8::String> ToStringChecked(v8::Isolate* isolate) const {
if (is_one_byte()) {
NonOwningExternalOneByteResource* source =
new NonOwningExternalOneByteResource(one_bytes_data(), length_);
return v8::String::NewExternalOneByte(isolate, source).ToLocalChecked();
} else {
NonOwningExternalTwoByteResource* source =
new NonOwningExternalTwoByteResource(two_bytes_data(), length_);
return v8::String::NewExternalTwoByte(isolate, source).ToLocalChecked();
}
}
size_t length() { return length_; }
v8::Local<v8::String> ToStringChecked(v8::Isolate* isolate) const;
size_t length() const { return length_; }

private:
const uint8_t* one_bytes_;
const uint16_t* two_bytes_;
size_t length_;
std::shared_ptr<void> owning_ptr_;
};

} // namespace node
Expand Down
62 changes: 62 additions & 0 deletions src/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -515,4 +515,66 @@ void SetConstructorFunction(Isolate* isolate,
that->Set(name, tmpl);
}

namespace {

class NonOwningExternalOneByteResource
: public v8::String::ExternalOneByteStringResource {
public:
explicit NonOwningExternalOneByteResource(const UnionBytes& source)
: source_(source) {}
~NonOwningExternalOneByteResource() override = default;

const char* data() const override {
return reinterpret_cast<const char*>(source_.one_bytes_data());
}
size_t length() const override { return source_.length(); }

NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) =
delete;
NonOwningExternalOneByteResource& operator=(
const NonOwningExternalOneByteResource&) = delete;

private:
const UnionBytes source_;
};

class NonOwningExternalTwoByteResource
: public v8::String::ExternalStringResource {
public:
explicit NonOwningExternalTwoByteResource(const UnionBytes& source)
: source_(source) {}
~NonOwningExternalTwoByteResource() override = default;

const uint16_t* data() const override { return source_.two_bytes_data(); }
size_t length() const override { return source_.length(); }

NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) =
delete;
NonOwningExternalTwoByteResource& operator=(
const NonOwningExternalTwoByteResource&) = delete;

private:
const UnionBytes source_;
};

} // anonymous namespace

Local<String> UnionBytes::ToStringChecked(Isolate* isolate) const {
if (UNLIKELY(length() == 0)) {
// V8 requires non-null data pointers for empty external strings,
// but we don't guarantee that. Solve this by not creating an
// external string at all in that case.
return String::Empty(isolate);
}
if (is_one_byte()) {
NonOwningExternalOneByteResource* source =
new NonOwningExternalOneByteResource(*this);
return String::NewExternalOneByte(isolate, source).ToLocalChecked();
} else {
NonOwningExternalTwoByteResource* source =
new NonOwningExternalTwoByteResource(*this);
return String::NewExternalTwoByte(isolate, source).ToLocalChecked();
}
}

} // namespace node
17 changes: 16 additions & 1 deletion test/cctest/test_util.cc
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#include "util-inl.h"
#include "debug_utils-inl.h"
#include "env-inl.h"
#include "gtest/gtest.h"
#include "simdutf.h"
#include "util-inl.h"

using node::Calloc;
using node::Malloc;
Expand Down Expand Up @@ -298,3 +299,17 @@ TEST(UtilTest, SPrintF) {
const std::string with_zero = std::string("a") + '\0' + 'b';
EXPECT_EQ(SPrintF("%s", with_zero), with_zero);
}

TEST(UtilTest, SimdutfEndiannessDoesNotMeanEndianness) {
// In simdutf, "LE" does *not* refer to Little Endian, it refers
anonrig marked this conversation as resolved.
Show resolved Hide resolved
// to 16-byte code units that are stored using *host* endianness.
// This is weird and confusing naming, and so we add this assertion
// here to verify that this is actually the case (so that CI tells
// us if it changed, because for most people Little Endian is
// host endianness, so locally everything would work fine).
const char utf8source[] = "\xe7\x8c\xab";
char16_t u16output;
size_t u16len = simdutf::convert_utf8_to_utf16le(utf8source, 3, &u16output);
EXPECT_EQ(u16len, 1u);
EXPECT_EQ(u16output, 0x732B);
}