From 13c8029889d7b80fccfc83915aa683ef7d0af651 Mon Sep 17 00:00:00 2001 From: James M Snell Date: Wed, 19 Jun 2024 07:10:27 -0700 Subject: [PATCH] src, deps: move string_search.h to nbytes, add version metadata --- deps/nbytes/README.md | 5 + deps/nbytes/nbytes.cpp | 6 +- deps/nbytes/nbytes.h | 632 ++++++++++++++++++++++++ node.gyp | 1 - src/node_buffer.cc | 80 ++-- src/node_metadata.cc | 2 + src/node_metadata.h | 1 + src/string_search.h | 638 ------------------------- test/parallel/test-process-versions.js | 2 + 9 files changed, 685 insertions(+), 682 deletions(-) create mode 100644 deps/nbytes/README.md delete mode 100644 src/string_search.h diff --git a/deps/nbytes/README.md b/deps/nbytes/README.md new file mode 100644 index 00000000000000..9ff412adb30560 --- /dev/null +++ b/deps/nbytes/README.md @@ -0,0 +1,5 @@ +# Node.js bytes (nbytes) library + +The `nbytes` library extracts certain Node.js specific byte manipulation +functions from the core of Node.js itself and makes them available for +use in other projects that need to emulate Node.js' behavior. 
diff --git a/deps/nbytes/nbytes.cpp b/deps/nbytes/nbytes.cpp index 565e31646395db..4eb4b6e6aa72a8 100644 --- a/deps/nbytes/nbytes.cpp +++ b/deps/nbytes/nbytes.cpp @@ -56,7 +56,7 @@ bool SwapBytes16(void* data, size_t nbytes) { for (size_t i = 0; i < len16; i++) { data16[i] = BSWAP_2(data16[i]); } - return; + return true; } #endif @@ -82,7 +82,7 @@ bool SwapBytes32(void* data, size_t nbytes) { for (size_t i = 0; i < len32; i++) { data32[i] = BSWAP_4(data32[i]); } - return; + return true; } #endif @@ -108,7 +108,7 @@ bool SwapBytes64(void* data, size_t nbytes) { for (size_t i = 0; i < len64; i++) { data64[i] = BSWAP_8(data64[i]); } - return; + return true; } #endif diff --git a/deps/nbytes/nbytes.h b/deps/nbytes/nbytes.h index acb0f1bedf1549..7af92ddc3d538e 100644 --- a/deps/nbytes/nbytes.h +++ b/deps/nbytes/nbytes.h @@ -1,8 +1,10 @@ #pragma once +#include #include #include #include +#include #include namespace nbytes { @@ -39,6 +41,15 @@ namespace nbytes { #define NBYTES_ASSERT_TRUE(COND) #endif +[[noreturn]] inline void unreachable() { +#ifdef __GNUC__ + __builtin_unreachable(); +#elif defined(_MSC_VER) + __assume(false); +#else +#endif +} + // The nbytes (short for "node bytes") is a set of utility helpers for // working with bytes that are extracted from Node.js' internals. 
The // motivation for extracting these into a separate library is to make it @@ -58,6 +69,12 @@ constexpr T* AlignUp(T* ptr, U alignment) { RoundUp(reinterpret_cast(ptr), alignment)); } +template +inline T AlignDown(T value, U alignment) { + return reinterpret_cast( + (reinterpret_cast(value) & ~(alignment - 1))); +} + template inline T MultiplyWithOverflowCheck(T a, T b) { auto ret = a * b; @@ -235,4 +252,619 @@ size_t HexEncode( std::string HexEncode(const char* src, size_t slen); +// ============================================================================ +// StringSearch + +namespace stringsearch { + +template +class Vector { + public: + Vector(T* data, size_t length, bool isForward) + : start_(data), length_(length), is_forward_(isForward) { + CHECK(length > 0 && data != nullptr); + } + + // Returns the start of the memory range. + // For vector v this is NOT necessarily &v[0], see forward(). + const T* start() const { return start_; } + + // Returns the length of the vector, in characters. + size_t length() const { return length_; } + + // Returns true if the Vector is front-to-back, false if back-to-front. + // In the latter case, v[0] corresponds to the *end* of the memory range. + bool forward() const { return is_forward_; } + + // Access individual vector elements - checks bounds in debug mode. + T& operator[](size_t index) const { + NBYTES_ASSERT_TRUE(index < length_); + return start_[is_forward_ ? index : (length_ - index - 1)]; + } + + private: + T* start_; + size_t length_; + bool is_forward_; +}; + +//--------------------------------------------------------------------- +// String Search object. +//--------------------------------------------------------------------- + +// Class holding constants and methods that apply to all string search variants, +// independently of subject and pattern char size. +class StringSearchBase { + protected: + // Cap on the maximal shift in the Boyer-Moore implementation. 
By setting a + // limit, we can fix the size of tables. For a needle longer than this limit, + // search will not be optimal, since we only build tables for a suffix + // of the string, but it is a safe approximation. + static const int kBMMaxShift = 250; + + // Reduce alphabet to this size. + // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size + // proportional to the input alphabet. We reduce the alphabet size by + // equating input characters modulo a smaller alphabet size. This gives + // a potentially less efficient searching, but is a safe approximation. + // For needles using only characters in the same Unicode 256-code point page, + // there is no search speed degradation. + static const int kLatin1AlphabetSize = 256; + static const int kUC16AlphabetSize = 256; + + // Bad-char shift table stored in the state. Its length is the alphabet size. + // For patterns below this length, the skip length of Boyer-Moore is too short + // to compensate for the algorithmic overhead compared to simple brute force. + static const int kBMMinPatternLength = 8; + + // Store for the BoyerMoore(Horspool) bad char shift table. + int bad_char_shift_table_[kUC16AlphabetSize]; + // Store for the BoyerMoore good suffix shift table. + int good_suffix_shift_table_[kBMMaxShift + 1]; + // Table used temporarily while building the BoyerMoore good suffix + // shift table. 
+ int suffix_table_[kBMMaxShift + 1]; +}; + +template +class StringSearch : private StringSearchBase { + public: + typedef stringsearch::Vector Vector; + + explicit StringSearch(Vector pattern) + : pattern_(pattern), start_(0) { + if (pattern.length() >= kBMMaxShift) { + start_ = pattern.length() - kBMMaxShift; + } + + size_t pattern_length = pattern_.length(); + NBYTES_ASSERT_TRUE(pattern_length > 0); + if (pattern_length < kBMMinPatternLength) { + if (pattern_length == 1) { + strategy_ = SearchStrategy::kSingleChar; + return; + } + strategy_ = SearchStrategy::kLinear; + return; + } + strategy_ = SearchStrategy::kInitial; + } + + size_t Search(Vector subject, size_t index) { + switch (strategy_) { + case kBoyerMooreHorspool: + return BoyerMooreHorspoolSearch(subject, index); + case kBoyerMoore: + return BoyerMooreSearch(subject, index); + case kInitial: + return InitialSearch(subject, index); + case kLinear: + return LinearSearch(subject, index); + case kSingleChar: + return SingleCharSearch(subject, index); + } + unreachable(); + } + + static inline int AlphabetSize() { + if (sizeof(Char) == 1) { + // Latin1 needle. + return kLatin1AlphabetSize; + } else { + // UC16 needle. 
+ return kUC16AlphabetSize; + } + + static_assert(sizeof(Char) == sizeof(uint8_t) || + sizeof(Char) == sizeof(uint16_t), + "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)"); + } + + private: + typedef size_t (StringSearch::*SearchFunction)(Vector, size_t); + size_t SingleCharSearch(Vector subject, size_t start_index); + size_t LinearSearch(Vector subject, size_t start_index); + size_t InitialSearch(Vector subject, size_t start_index); + size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index); + size_t BoyerMooreSearch(Vector subject, size_t start_index); + + void PopulateBoyerMooreHorspoolTable(); + + void PopulateBoyerMooreTable(); + + static inline int CharOccurrence(int* bad_char_occurrence, + Char char_code) { + if (sizeof(Char) == 1) { + return bad_char_occurrence[static_cast(char_code)]; + } + // Both pattern and subject are UC16. Reduce character to equivalence class. + int equiv_class = char_code % kUC16AlphabetSize; + return bad_char_occurrence[equiv_class]; + } + + enum SearchStrategy { + kBoyerMooreHorspool, + kBoyerMoore, + kInitial, + kLinear, + kSingleChar, + }; + + // The pattern to search for. + Vector pattern_; + SearchStrategy strategy_; + // Cache value of Max(0, pattern_length() - kBMMaxShift) + size_t start_; +}; + +inline uint8_t GetHighestValueByte(uint16_t character) { + return std::max(static_cast(character & 0xFF), + static_cast(character >> 8)); +} + +inline uint8_t GetHighestValueByte(uint8_t character) { return character; } + +// Searches for a byte value in a memory buffer, back to front. +// Uses memrchr(3) on systems which support it, for speed. +// Falls back to a vanilla for loop on non-GNU systems such as Windows. 
+inline const void* MemrchrFill(const void* haystack, uint8_t needle, + size_t haystack_len) { +#ifdef _GNU_SOURCE + return memrchr(haystack, needle, haystack_len); +#else + const uint8_t* haystack8 = static_cast(haystack); + for (size_t i = haystack_len - 1; i != static_cast(-1); i--) { + if (haystack8[i] == needle) { + return haystack8 + i; + } + } + return nullptr; +#endif +} + +// Finds the first occurrence of *two-byte* character pattern[0] in the string +// `subject`. Does not check that the whole pattern matches. +template +inline size_t FindFirstCharacter(Vector pattern, + Vector subject, size_t index) { + const Char pattern_first_char = pattern[0]; + const size_t max_n = (subject.length() - pattern.length() + 1); + + // For speed, search for the more `rare` of the two bytes in pattern[0] + // using memchr / memrchr (which are much faster than a simple for loop). + const uint8_t search_byte = GetHighestValueByte(pattern_first_char); + size_t pos = index; + do { + const size_t bytes_to_search = (max_n - pos) * sizeof(Char); + const void* void_pos; + if (subject.forward()) { + // Assert that bytes_to_search won't overflow + NBYTES_ASSERT_TRUE(pos <= max_n); + NBYTES_ASSERT_TRUE(max_n - pos <= SIZE_MAX / sizeof(Char)); + void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); + } else { + NBYTES_ASSERT_TRUE(pos <= subject.length()); + NBYTES_ASSERT_TRUE(subject.length() - pos <= SIZE_MAX / sizeof(Char)); + void_pos = MemrchrFill(subject.start() + pattern.length() - 1, + search_byte, + bytes_to_search); + } + const Char* char_pos = static_cast(void_pos); + if (char_pos == nullptr) + return subject.length(); + + // Then, for each match, verify that the full two bytes match pattern[0]. + char_pos = AlignDown(char_pos, sizeof(Char)); + size_t raw_pos = static_cast(char_pos - subject.start()); + pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1); + if (subject[pos] == pattern_first_char) { + // Match found, hooray. 
+ return pos; + } + // Search byte matched, but the other byte of pattern[0] didn't. Keep going. + } while (++pos < max_n); + + return subject.length(); +} + +// Finds the first occurrence of the byte pattern[0] in string `subject`. +// Does not verify that the whole pattern matches. +template <> +inline size_t FindFirstCharacter(Vector pattern, + Vector subject, + size_t index) { + const uint8_t pattern_first_char = pattern[0]; + const size_t subj_len = subject.length(); + const size_t max_n = (subject.length() - pattern.length() + 1); + + const void* pos; + if (subject.forward()) { + pos = memchr(subject.start() + index, pattern_first_char, max_n - index); + } else { + pos = MemrchrFill(subject.start() + pattern.length() - 1, + pattern_first_char, + max_n - index); + } + const uint8_t* char_pos = static_cast(pos); + if (char_pos == nullptr) { + return subj_len; + } + + size_t raw_pos = static_cast(char_pos - subject.start()); + return subject.forward() ? raw_pos : (subj_len - raw_pos - 1); +} + +//--------------------------------------------------------------------- +// Single Character Pattern Search Strategy +//--------------------------------------------------------------------- + +template +size_t StringSearch::SingleCharSearch( + Vector subject, + size_t index) { + NBYTES_ASSERT_TRUE(1 == pattern_.length()); + return FindFirstCharacter(pattern_, subject, index); +} + +//--------------------------------------------------------------------- +// Linear Search Strategy +//--------------------------------------------------------------------- + +// Simple linear search for short patterns. Never bails out. 
+template +size_t StringSearch::LinearSearch( + Vector subject, + size_t index) { + NBYTES_ASSERT_TRUE(pattern_.length() > 1); + const size_t n = subject.length() - pattern_.length(); + for (size_t i = index; i <= n; i++) { + i = FindFirstCharacter(pattern_, subject, i); + if (i == subject.length()) + return subject.length(); + NBYTES_ASSERT_TRUE(i <= n); + + bool matches = true; + for (size_t j = 1; j < pattern_.length(); j++) { + if (pattern_[j] != subject[i + j]) { + matches = false; + break; + } + } + if (matches) { + return i; + } + } + return subject.length(); +} + +//--------------------------------------------------------------------- +// Boyer-Moore string search +//--------------------------------------------------------------------- + +template +size_t StringSearch::BoyerMooreSearch( + Vector subject, + size_t start_index) { + const size_t subject_length = subject.length(); + const size_t pattern_length = pattern_.length(); + // Only preprocess at most kBMMaxShift last characters of pattern. + size_t start = start_; + + int* bad_char_occurrence = bad_char_shift_table_; + int* good_suffix_shift = good_suffix_shift_table_ - start_; + + Char last_char = pattern_[pattern_length - 1]; + size_t index = start_index; + // Continue search from i. + while (index <= subject_length - pattern_length) { + size_t j = pattern_length - 1; + int c; + while (last_char != (c = subject[index + j])) { + int shift = j - CharOccurrence(bad_char_occurrence, c); + index += shift; + if (index > subject_length - pattern_length) { + return subject.length(); + } + } + while (pattern_[j] == (c = subject[index + j])) { + if (j == 0) { + return index; + } + j--; + } + if (j < start) { + // we have matched more than our tables allow us to be smart about. + // Fall back on BMH shift. 
+ index += pattern_length - 1 - + CharOccurrence(bad_char_occurrence, last_char); + } else { + int gs_shift = good_suffix_shift[j + 1]; + int bc_occ = CharOccurrence(bad_char_occurrence, c); + int shift = j - bc_occ; + if (gs_shift > shift) { + shift = gs_shift; + } + index += shift; + } + } + + return subject.length(); +} + +template +void StringSearch::PopulateBoyerMooreTable() { + const size_t pattern_length = pattern_.length(); + // Only look at the last kBMMaxShift characters of pattern (from start_ + // to pattern_length). + const size_t start = start_; + const size_t length = pattern_length - start; + + // Biased tables so that we can use pattern indices as table indices, + // even if we only cover the part of the pattern from offset start. + int* shift_table = good_suffix_shift_table_ - start_; + int* suffix_table = suffix_table_ - start_; + + // Initialize table. + for (size_t i = start; i < pattern_length; i++) { + shift_table[i] = length; + } + shift_table[pattern_length] = 1; + suffix_table[pattern_length] = pattern_length + 1; + + if (pattern_length <= start) { + return; + } + + // Find suffixes. + Char last_char = pattern_[pattern_length - 1]; + size_t suffix = pattern_length + 1; + { + size_t i = pattern_length; + while (i > start) { + Char c = pattern_[i - 1]; + while (suffix <= pattern_length && c != pattern_[suffix - 1]) { + if (static_cast(shift_table[suffix]) == length) { + shift_table[suffix] = suffix - i; + } + suffix = suffix_table[suffix]; + } + suffix_table[--i] = --suffix; + if (suffix == pattern_length) { + // No suffix to extend, so we check against last_char only. + while ((i > start) && (pattern_[i - 1] != last_char)) { + if (static_cast(shift_table[pattern_length]) == length) { + shift_table[pattern_length] = pattern_length - i; + } + suffix_table[--i] = pattern_length; + } + if (i > start) { + suffix_table[--i] = --suffix; + } + } + } + } + // Build shift table using suffixes. 
+ if (suffix < pattern_length) { + for (size_t i = start; i <= pattern_length; i++) { + if (static_cast(shift_table[i]) == length) { + shift_table[i] = suffix - start; + } + if (i == suffix) { + suffix = suffix_table[suffix]; + } + } + } +} + +//--------------------------------------------------------------------- +// Boyer-Moore-Horspool string search. +//--------------------------------------------------------------------- + +template +size_t StringSearch::BoyerMooreHorspoolSearch( + Vector subject, + size_t start_index) { + const size_t subject_length = subject.length(); + const size_t pattern_length = pattern_.length(); + int* char_occurrences = bad_char_shift_table_; + int64_t badness = -static_cast(pattern_length); + + // How bad we are doing without a good-suffix table. + Char last_char = pattern_[pattern_length - 1]; + int last_char_shift = + pattern_length - 1 - + CharOccurrence(char_occurrences, last_char); + + // Perform search + size_t index = start_index; // No matches found prior to this index. + while (index <= subject_length - pattern_length) { + size_t j = pattern_length - 1; + int subject_char; + while (last_char != (subject_char = subject[index + j])) { + int bc_occ = CharOccurrence(char_occurrences, subject_char); + int shift = j - bc_occ; + index += shift; + badness += 1 - shift; // at most zero, so badness cannot increase. + if (index > subject_length - pattern_length) { + return subject_length; + } + } + j--; + while (pattern_[j] == (subject[index + j])) { + if (j == 0) { + return index; + } + j--; + } + index += last_char_shift; + // Badness increases by the number of characters we have + // checked, and decreases by the number of characters we + // can skip by shifting. It's a measure of how we are doing + // compared to reading each character exactly once. 
+ badness += (pattern_length - j) - last_char_shift; + if (badness > 0) { + PopulateBoyerMooreTable(); + strategy_ = SearchStrategy::kBoyerMoore; + return BoyerMooreSearch(subject, index); + } + } + return subject.length(); +} + +template +void StringSearch::PopulateBoyerMooreHorspoolTable() { + const size_t pattern_length = pattern_.length(); + + int* bad_char_occurrence = bad_char_shift_table_; + + // Only preprocess at most kBMMaxShift last characters of pattern. + const size_t start = start_; + // Run forwards to populate bad_char_table, so that *last* instance + // of character equivalence class is the one registered. + // Notice: Doesn't include the last character. + const size_t table_size = AlphabetSize(); + if (start == 0) { + // All patterns less than kBMMaxShift in length. + memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); + } else { + for (size_t i = 0; i < table_size; i++) { + bad_char_occurrence[i] = start - 1; + } + } + for (size_t i = start; i < pattern_length - 1; i++) { + Char c = pattern_[i]; + int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize(); + bad_char_occurrence[bucket] = i; + } +} + +//--------------------------------------------------------------------- +// Linear string search with bailout to BMH. +//--------------------------------------------------------------------- + +// Simple linear search for short patterns, which bails out if the string +// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. +template +size_t StringSearch::InitialSearch( + Vector subject, + size_t index) { + const size_t pattern_length = pattern_.length(); + // Badness is a count of how much work we have done. When we have + // done enough work we decide it's probably worth switching to a better + // algorithm. + int64_t badness = -10 - (pattern_length << 2); + + // We know our pattern is at least 2 characters, we cache the first so + // the common case of the first character not matching is faster. 
+ for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) { + badness++; + if (badness <= 0) { + i = FindFirstCharacter(pattern_, subject, i); + if (i == subject.length()) + return subject.length(); + NBYTES_ASSERT_TRUE(i <= n); + size_t j = 1; + do { + if (pattern_[j] != subject[i + j]) { + break; + } + j++; + } while (j < pattern_length); + if (j == pattern_length) { + return i; + } + badness += j; + } else { + PopulateBoyerMooreHorspoolTable(); + strategy_ = SearchStrategy::kBoyerMooreHorspool; + return BoyerMooreHorspoolSearch(subject, i); + } + } + return subject.length(); +} + +// Perform a single stand-alone search. +// If searching multiple times for the same pattern, a search +// object should be constructed once and the Search function then called +// for each search. +template +size_t SearchString(Vector subject, + Vector pattern, + size_t start_index) { + StringSearch search(pattern); + return search.Search(subject, start_index); +} +} // namespace stringsearch + +template +size_t SearchString(const Char* haystack, + size_t haystack_length, + const Char* needle, + size_t needle_length, + size_t start_index, + bool is_forward) { + if (haystack_length < needle_length) return haystack_length; + // To do a reverse search (lastIndexOf instead of indexOf) without redundant + // code, create two vectors that are reversed views into the input strings. + // For example, v_needle[0] would return the *last* character of the needle. 
+ // So we're searching for the first instance of rev(needle) in rev(haystack) + stringsearch::Vector v_needle(needle, needle_length, is_forward); + stringsearch::Vector v_haystack( + haystack, haystack_length, is_forward); + size_t diff = haystack_length - needle_length; + size_t relative_start_index; + if (is_forward) { + relative_start_index = start_index; + } else if (diff < start_index) { + relative_start_index = 0; + } else { + relative_start_index = diff - start_index; + } + size_t pos = stringsearch::SearchString( + v_haystack, v_needle, relative_start_index); + if (pos == haystack_length) { + // not found + return pos; + } + return is_forward ? pos : (haystack_length - needle_length - pos); +} + +template +size_t SearchString(const char* haystack, size_t haystack_length, + const char (&needle)[N]) { + return SearchString( + reinterpret_cast(haystack), haystack_length, + reinterpret_cast(needle), N - 1, 0, true); +} + +// ============================================================================ +// Version metadata +#define NBYTES_VERSION "0.0.1" + +enum { + NBYTES_VERSION_MAJOR = 0, + NBYTES_VERSION_MINOR = 0, + NBYTES_VERSION_REVISION = 1, +}; + } // namespace nbytes diff --git a/node.gyp b/node.gyp index 4419bf71dde523..0685742e8165fa 100644 --- a/node.gyp +++ b/node.gyp @@ -291,7 +291,6 @@ 'src/string_bytes.h', 'src/string_decoder.h', 'src/string_decoder-inl.h', - 'src/string_search.h', 'src/tcp_wrap.h', 'src/timers.h', 'src/tracing/agent.h', diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 53e4f2888fffdc..02a6a79492cf12 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -30,7 +30,7 @@ #include "env-inl.h" #include "simdutf.h" #include "string_bytes.h" -#include "string_search.h" + #include "util-inl.h" #include "v8-fast-api-calls.h" #include "v8.h" @@ -970,19 +970,20 @@ void IndexOfString(const FunctionCallbackInfo& args) { if (decoded_string == nullptr) return args.GetReturnValue().Set(-1); - result = 
SearchString(reinterpret_cast(haystack), - haystack_length / 2, - decoded_string, - decoder.size() / 2, - offset / 2, - is_forward); + result = nbytes::SearchString(reinterpret_cast(haystack), + haystack_length / 2, + decoded_string, + decoder.size() / 2, + offset / 2, + is_forward); } else { - result = SearchString(reinterpret_cast(haystack), - haystack_length / 2, - reinterpret_cast(*needle_value), - needle_value.length(), - offset / 2, - is_forward); + result = + nbytes::SearchString(reinterpret_cast(haystack), + haystack_length / 2, + reinterpret_cast(*needle_value), + needle_value.length(), + offset / 2, + is_forward); } result *= 2; } else if (enc == UTF8) { @@ -990,12 +991,13 @@ void IndexOfString(const FunctionCallbackInfo& args) { if (*needle_value == nullptr) return args.GetReturnValue().Set(-1); - result = SearchString(reinterpret_cast(haystack), - haystack_length, - reinterpret_cast(*needle_value), - needle_length, - offset, - is_forward); + result = + nbytes::SearchString(reinterpret_cast(haystack), + haystack_length, + reinterpret_cast(*needle_value), + needle_length, + offset, + is_forward); } else if (enc == LATIN1) { uint8_t* needle_data = node::UncheckedMalloc(needle_length); if (needle_data == nullptr) { @@ -1004,12 +1006,12 @@ void IndexOfString(const FunctionCallbackInfo& args) { needle->WriteOneByte( isolate, needle_data, 0, needle_length, String::NO_NULL_TERMINATION); - result = SearchString(reinterpret_cast(haystack), - haystack_length, - needle_data, - needle_length, - offset, - is_forward); + result = nbytes::SearchString(reinterpret_cast(haystack), + haystack_length, + needle_data, + needle_length, + offset, + is_forward); free(needle_data); } @@ -1068,22 +1070,20 @@ void IndexOfBuffer(const FunctionCallbackInfo& args) { if (haystack_length < 2 || needle_length < 2) { return args.GetReturnValue().Set(-1); } - result = SearchString( - reinterpret_cast(haystack), - haystack_length / 2, - reinterpret_cast(needle), - needle_length / 2, - 
offset / 2, - is_forward); + result = nbytes::SearchString(reinterpret_cast(haystack), + haystack_length / 2, + reinterpret_cast(needle), + needle_length / 2, + offset / 2, + is_forward); result *= 2; } else { - result = SearchString( - reinterpret_cast(haystack), - haystack_length, - reinterpret_cast(needle), - needle_length, - offset, - is_forward); + result = nbytes::SearchString(reinterpret_cast(haystack), + haystack_length, + reinterpret_cast(needle), + needle_length, + offset, + is_forward); } args.GetReturnValue().Set( @@ -1106,7 +1106,7 @@ int32_t IndexOfNumber(const uint8_t* buffer_data, if (is_forward) { ptr = memchr(buffer_data + offset, needle, buffer_length - offset); } else { - ptr = node::stringsearch::MemrchrFill(buffer_data, needle, offset + 1); + ptr = nbytes::stringsearch::MemrchrFill(buffer_data, needle, offset + 1); } const uint8_t* ptr_uint8 = static_cast(ptr); return ptr != nullptr ? static_cast(ptr_uint8 - buffer_data) : -1; diff --git a/src/node_metadata.cc b/src/node_metadata.cc index 3b7493f82b91b8..937e415eb55857 100644 --- a/src/node_metadata.cc +++ b/src/node_metadata.cc @@ -5,6 +5,7 @@ #include "brotli/encode.h" #include "cjs_module_lexer_version.h" #include "llhttp.h" +#include "nbytes.h" #include "nghttp2/nghttp2ver.h" #include "node.h" #include "simdjson.h" @@ -133,6 +134,7 @@ Metadata::Versions::Versions() { simdutf = SIMDUTF_VERSION; sqlite = SQLITE_VERSION; ada = ADA_VERSION; + nbytes = NBYTES_VERSION; } Metadata::Release::Release() : name(NODE_RELEASE) { diff --git a/src/node_metadata.h b/src/node_metadata.h index 5400220424e8d7..90c7dbf22c8e85 100644 --- a/src/node_metadata.h +++ b/src/node_metadata.h @@ -50,6 +50,7 @@ namespace node { V(simdutf) \ V(sqlite) \ V(ada) \ + V(nbytes) \ NODE_VERSIONS_KEY_UNDICI(V) \ V(cjs_module_lexer) diff --git a/src/string_search.h b/src/string_search.h deleted file mode 100644 index cd9ef320a81112..00000000000000 --- a/src/string_search.h +++ /dev/null @@ -1,638 +0,0 @@ -// Copyright 2011 the 
V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef SRC_STRING_SEARCH_H_ -#define SRC_STRING_SEARCH_H_ - -#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS - -#include "util.h" - -#include -#include - -namespace node { -namespace stringsearch { - -template -class Vector { - public: - Vector(T* data, size_t length, bool isForward) - : start_(data), length_(length), is_forward_(isForward) { - CHECK(length > 0 && data != nullptr); - } - - // Returns the start of the memory range. - // For vector v this is NOT necessarily &v[0], see forward(). - const T* start() const { return start_; } - - // Returns the length of the vector, in characters. - size_t length() const { return length_; } - - // Returns true if the Vector is front-to-back, false if back-to-front. - // In the latter case, v[0] corresponds to the *end* of the memory range. - bool forward() const { return is_forward_; } - - // Access individual vector elements - checks bounds in debug mode. - T& operator[](size_t index) const { - DCHECK_LT(index, length_); - return start_[is_forward_ ? index : (length_ - index - 1)]; - } - - private: - T* start_; - size_t length_; - bool is_forward_; -}; - - -//--------------------------------------------------------------------- -// String Search object. -//--------------------------------------------------------------------- - -// Class holding constants and methods that apply to all string search variants, -// independently of subject and pattern char size. -class StringSearchBase { - protected: - // Cap on the maximal shift in the Boyer-Moore implementation. By setting a - // limit, we can fix the size of tables. For a needle longer than this limit, - // search will not be optimal, since we only build tables for a suffix - // of the string, but it is a safe approximation. - static const int kBMMaxShift = 250; - - // Reduce alphabet to this size. 
- // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size - // proportional to the input alphabet. We reduce the alphabet size by - // equating input characters modulo a smaller alphabet size. This gives - // a potentially less efficient searching, but is a safe approximation. - // For needles using only characters in the same Unicode 256-code point page, - // there is no search speed degradation. - static const int kLatin1AlphabetSize = 256; - static const int kUC16AlphabetSize = 256; - - // Bad-char shift table stored in the state. It's length is the alphabet size. - // For patterns below this length, the skip length of Boyer-Moore is too short - // to compensate for the algorithmic overhead compared to simple brute force. - static const int kBMMinPatternLength = 8; - - // Store for the BoyerMoore(Horspool) bad char shift table. - int bad_char_shift_table_[kUC16AlphabetSize]; - // Store for the BoyerMoore good suffix shift table. - int good_suffix_shift_table_[kBMMaxShift + 1]; - // Table used temporarily while building the BoyerMoore good suffix - // shift table. 
- int suffix_table_[kBMMaxShift + 1]; -}; - -template -class StringSearch : private StringSearchBase { - public: - typedef stringsearch::Vector Vector; - - explicit StringSearch(Vector pattern) - : pattern_(pattern), start_(0) { - if (pattern.length() >= kBMMaxShift) { - start_ = pattern.length() - kBMMaxShift; - } - - size_t pattern_length = pattern_.length(); - CHECK_GT(pattern_length, 0); - if (pattern_length < kBMMinPatternLength) { - if (pattern_length == 1) { - strategy_ = SearchStrategy::kSingleChar; - return; - } - strategy_ = SearchStrategy::kLinear; - return; - } - strategy_ = SearchStrategy::kInitial; - } - - size_t Search(Vector subject, size_t index) { - switch (strategy_) { - case kBoyerMooreHorspool: - return BoyerMooreHorspoolSearch(subject, index); - case kBoyerMoore: - return BoyerMooreSearch(subject, index); - case kInitial: - return InitialSearch(subject, index); - case kLinear: - return LinearSearch(subject, index); - case kSingleChar: - return SingleCharSearch(subject, index); - } - UNREACHABLE(); - } - - static inline int AlphabetSize() { - if (sizeof(Char) == 1) { - // Latin1 needle. - return kLatin1AlphabetSize; - } else { - // UC16 needle. 
- return kUC16AlphabetSize; - } - - static_assert(sizeof(Char) == sizeof(uint8_t) || - sizeof(Char) == sizeof(uint16_t), - "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)"); - } - - private: - typedef size_t (StringSearch::*SearchFunction)(Vector, size_t); - size_t SingleCharSearch(Vector subject, size_t start_index); - size_t LinearSearch(Vector subject, size_t start_index); - size_t InitialSearch(Vector subject, size_t start_index); - size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index); - size_t BoyerMooreSearch(Vector subject, size_t start_index); - - void PopulateBoyerMooreHorspoolTable(); - - void PopulateBoyerMooreTable(); - - static inline int CharOccurrence(int* bad_char_occurrence, - Char char_code) { - if (sizeof(Char) == 1) { - return bad_char_occurrence[static_cast(char_code)]; - } - // Both pattern and subject are UC16. Reduce character to equivalence class. - int equiv_class = char_code % kUC16AlphabetSize; - return bad_char_occurrence[equiv_class]; - } - - enum SearchStrategy { - kBoyerMooreHorspool, - kBoyerMoore, - kInitial, - kLinear, - kSingleChar, - }; - - // The pattern to search for. - Vector pattern_; - SearchStrategy strategy_; - // Cache value of Max(0, pattern_length() - kBMMaxShift) - size_t start_; -}; - - -template -inline T AlignDown(T value, U alignment) { - return reinterpret_cast( - (reinterpret_cast(value) & ~(alignment - 1))); -} - - -inline uint8_t GetHighestValueByte(uint16_t character) { - return std::max(static_cast(character & 0xFF), - static_cast(character >> 8)); -} - - -inline uint8_t GetHighestValueByte(uint8_t character) { return character; } - - -// Searches for a byte value in a memory buffer, back to front. -// Uses memrchr(3) on systems which support it, for speed. -// Falls back to a vanilla for loop on non-GNU systems such as Windows. 
-inline const void* MemrchrFill(const void* haystack, uint8_t needle, - size_t haystack_len) { -#ifdef _GNU_SOURCE - return memrchr(haystack, needle, haystack_len); -#else - const uint8_t* haystack8 = static_cast(haystack); - for (size_t i = haystack_len - 1; i != static_cast(-1); i--) { - if (haystack8[i] == needle) { - return haystack8 + i; - } - } - return nullptr; -#endif -} - - -// Finds the first occurrence of *two-byte* character pattern[0] in the string -// `subject`. Does not check that the whole pattern matches. -template -inline size_t FindFirstCharacter(Vector pattern, - Vector subject, size_t index) { - const Char pattern_first_char = pattern[0]; - const size_t max_n = (subject.length() - pattern.length() + 1); - - // For speed, search for the more `rare` of the two bytes in pattern[0] - // using memchr / memrchr (which are much faster than a simple for loop). - const uint8_t search_byte = GetHighestValueByte(pattern_first_char); - size_t pos = index; - do { - const size_t bytes_to_search = (max_n - pos) * sizeof(Char); - const void* void_pos; - if (subject.forward()) { - // Assert that bytes_to_search won't overflow - CHECK_LE(pos, max_n); - CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char)); - void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); - } else { - CHECK_LE(pos, subject.length()); - CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char)); - void_pos = MemrchrFill(subject.start() + pattern.length() - 1, - search_byte, - bytes_to_search); - } - const Char* char_pos = static_cast(void_pos); - if (char_pos == nullptr) - return subject.length(); - - // Then, for each match, verify that the full two bytes match pattern[0]. - char_pos = AlignDown(char_pos, sizeof(Char)); - size_t raw_pos = static_cast(char_pos - subject.start()); - pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1); - if (subject[pos] == pattern_first_char) { - // Match found, hooray. 
- return pos; - } - // Search byte matched, but the other byte of pattern[0] didn't. Keep going. - } while (++pos < max_n); - - return subject.length(); -} - - -// Finds the first occurrence of the byte pattern[0] in string `subject`. -// Does not verify that the whole pattern matches. -template <> -inline size_t FindFirstCharacter(Vector pattern, - Vector subject, - size_t index) { - const uint8_t pattern_first_char = pattern[0]; - const size_t subj_len = subject.length(); - const size_t max_n = (subject.length() - pattern.length() + 1); - - const void* pos; - if (subject.forward()) { - pos = memchr(subject.start() + index, pattern_first_char, max_n - index); - } else { - pos = MemrchrFill(subject.start() + pattern.length() - 1, - pattern_first_char, - max_n - index); - } - const uint8_t* char_pos = static_cast(pos); - if (char_pos == nullptr) { - return subj_len; - } - - size_t raw_pos = static_cast(char_pos - subject.start()); - return subject.forward() ? raw_pos : (subj_len - raw_pos - 1); -} - -//--------------------------------------------------------------------- -// Single Character Pattern Search Strategy -//--------------------------------------------------------------------- - -template -size_t StringSearch::SingleCharSearch( - Vector subject, - size_t index) { - CHECK_EQ(1, pattern_.length()); - return FindFirstCharacter(pattern_, subject, index); -} - -//--------------------------------------------------------------------- -// Linear Search Strategy -//--------------------------------------------------------------------- - -// Simple linear search for short patterns. Never bails out. 
-template -size_t StringSearch::LinearSearch( - Vector subject, - size_t index) { - CHECK_GT(pattern_.length(), 1); - const size_t n = subject.length() - pattern_.length(); - for (size_t i = index; i <= n; i++) { - i = FindFirstCharacter(pattern_, subject, i); - if (i == subject.length()) - return subject.length(); - CHECK_LE(i, n); - - bool matches = true; - for (size_t j = 1; j < pattern_.length(); j++) { - if (pattern_[j] != subject[i + j]) { - matches = false; - break; - } - } - if (matches) { - return i; - } - } - return subject.length(); -} - -//--------------------------------------------------------------------- -// Boyer-Moore string search -//--------------------------------------------------------------------- - -template -size_t StringSearch::BoyerMooreSearch( - Vector subject, - size_t start_index) { - const size_t subject_length = subject.length(); - const size_t pattern_length = pattern_.length(); - // Only preprocess at most kBMMaxShift last characters of pattern. - size_t start = start_; - - int* bad_char_occurrence = bad_char_shift_table_; - int* good_suffix_shift = good_suffix_shift_table_ - start_; - - Char last_char = pattern_[pattern_length - 1]; - size_t index = start_index; - // Continue search from i. - while (index <= subject_length - pattern_length) { - size_t j = pattern_length - 1; - int c; - while (last_char != (c = subject[index + j])) { - int shift = j - CharOccurrence(bad_char_occurrence, c); - index += shift; - if (index > subject_length - pattern_length) { - return subject.length(); - } - } - while (pattern_[j] == (c = subject[index + j])) { - if (j == 0) { - return index; - } - j--; - } - if (j < start) { - // we have matched more than our tables allow us to be smart about. - // Fall back on BMH shift. 
- index += pattern_length - 1 - - CharOccurrence(bad_char_occurrence, last_char); - } else { - int gs_shift = good_suffix_shift[j + 1]; - int bc_occ = CharOccurrence(bad_char_occurrence, c); - int shift = j - bc_occ; - if (gs_shift > shift) { - shift = gs_shift; - } - index += shift; - } - } - - return subject.length(); -} - -template -void StringSearch::PopulateBoyerMooreTable() { - const size_t pattern_length = pattern_.length(); - // Only look at the last kBMMaxShift characters of pattern (from start_ - // to pattern_length). - const size_t start = start_; - const size_t length = pattern_length - start; - - // Biased tables so that we can use pattern indices as table indices, - // even if we only cover the part of the pattern from offset start. - int* shift_table = good_suffix_shift_table_ - start_; - int* suffix_table = suffix_table_ - start_; - - // Initialize table. - for (size_t i = start; i < pattern_length; i++) { - shift_table[i] = length; - } - shift_table[pattern_length] = 1; - suffix_table[pattern_length] = pattern_length + 1; - - if (pattern_length <= start) { - return; - } - - // Find suffixes. - Char last_char = pattern_[pattern_length - 1]; - size_t suffix = pattern_length + 1; - { - size_t i = pattern_length; - while (i > start) { - Char c = pattern_[i - 1]; - while (suffix <= pattern_length && c != pattern_[suffix - 1]) { - if (static_cast(shift_table[suffix]) == length) { - shift_table[suffix] = suffix - i; - } - suffix = suffix_table[suffix]; - } - suffix_table[--i] = --suffix; - if (suffix == pattern_length) { - // No suffix to extend, so we check against last_char only. - while ((i > start) && (pattern_[i - 1] != last_char)) { - if (static_cast(shift_table[pattern_length]) == length) { - shift_table[pattern_length] = pattern_length - i; - } - suffix_table[--i] = pattern_length; - } - if (i > start) { - suffix_table[--i] = --suffix; - } - } - } - } - // Build shift table using suffixes. 
- if (suffix < pattern_length) { - for (size_t i = start; i <= pattern_length; i++) { - if (static_cast(shift_table[i]) == length) { - shift_table[i] = suffix - start; - } - if (i == suffix) { - suffix = suffix_table[suffix]; - } - } - } -} - -//--------------------------------------------------------------------- -// Boyer-Moore-Horspool string search. -//--------------------------------------------------------------------- - -template -size_t StringSearch::BoyerMooreHorspoolSearch( - Vector subject, - size_t start_index) { - const size_t subject_length = subject.length(); - const size_t pattern_length = pattern_.length(); - int* char_occurrences = bad_char_shift_table_; - int64_t badness = -static_cast(pattern_length); - - // How bad we are doing without a good-suffix table. - Char last_char = pattern_[pattern_length - 1]; - int last_char_shift = - pattern_length - 1 - - CharOccurrence(char_occurrences, last_char); - - // Perform search - size_t index = start_index; // No matches found prior to this index. - while (index <= subject_length - pattern_length) { - size_t j = pattern_length - 1; - int subject_char; - while (last_char != (subject_char = subject[index + j])) { - int bc_occ = CharOccurrence(char_occurrences, subject_char); - int shift = j - bc_occ; - index += shift; - badness += 1 - shift; // at most zero, so badness cannot increase. - if (index > subject_length - pattern_length) { - return subject_length; - } - } - j--; - while (pattern_[j] == (subject[index + j])) { - if (j == 0) { - return index; - } - j--; - } - index += last_char_shift; - // Badness increases by the number of characters we have - // checked, and decreases by the number of characters we - // can skip by shifting. It's a measure of how we are doing - // compared to reading each character exactly once. 
- badness += (pattern_length - j) - last_char_shift; - if (badness > 0) { - PopulateBoyerMooreTable(); - strategy_ = SearchStrategy::kBoyerMoore; - return BoyerMooreSearch(subject, index); - } - } - return subject.length(); -} - -template -void StringSearch::PopulateBoyerMooreHorspoolTable() { - const size_t pattern_length = pattern_.length(); - - int* bad_char_occurrence = bad_char_shift_table_; - - // Only preprocess at most kBMMaxShift last characters of pattern. - const size_t start = start_; - // Run forwards to populate bad_char_table, so that *last* instance - // of character equivalence class is the one registered. - // Notice: Doesn't include the last character. - const size_t table_size = AlphabetSize(); - if (start == 0) { - // All patterns less than kBMMaxShift in length. - memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); - } else { - for (size_t i = 0; i < table_size; i++) { - bad_char_occurrence[i] = start - 1; - } - } - for (size_t i = start; i < pattern_length - 1; i++) { - Char c = pattern_[i]; - int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize(); - bad_char_occurrence[bucket] = i; - } -} - -//--------------------------------------------------------------------- -// Linear string search with bailout to BMH. -//--------------------------------------------------------------------- - -// Simple linear search for short patterns, which bails out if the string -// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. -template -size_t StringSearch::InitialSearch( - Vector subject, - size_t index) { - const size_t pattern_length = pattern_.length(); - // Badness is a count of how much work we have done. When we have - // done enough work we decide it's probably worth switching to a better - // algorithm. - int64_t badness = -10 - (pattern_length << 2); - - // We know our pattern is at least 2 characters, we cache the first so - // the common case of the first character not matching is faster. 
- for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) { - badness++; - if (badness <= 0) { - i = FindFirstCharacter(pattern_, subject, i); - if (i == subject.length()) - return subject.length(); - CHECK_LE(i, n); - size_t j = 1; - do { - if (pattern_[j] != subject[i + j]) { - break; - } - j++; - } while (j < pattern_length); - if (j == pattern_length) { - return i; - } - badness += j; - } else { - PopulateBoyerMooreHorspoolTable(); - strategy_ = SearchStrategy::kBoyerMooreHorspool; - return BoyerMooreHorspoolSearch(subject, i); - } - } - return subject.length(); -} - -// Perform a single stand-alone search. -// If searching multiple times for the same pattern, a search -// object should be constructed once and the Search function then called -// for each search. -template -size_t SearchString(Vector subject, - Vector pattern, - size_t start_index) { - StringSearch search(pattern); - return search.Search(subject, start_index); -} -} // namespace stringsearch -} // namespace node - -namespace node { - -template -size_t SearchString(const Char* haystack, - size_t haystack_length, - const Char* needle, - size_t needle_length, - size_t start_index, - bool is_forward) { - if (haystack_length < needle_length) return haystack_length; - // To do a reverse search (lastIndexOf instead of indexOf) without redundant - // code, create two vectors that are reversed views into the input strings. - // For example, v_needle[0] would return the *last* character of the needle. 
- // So we're searching for the first instance of rev(needle) in rev(haystack) - stringsearch::Vector v_needle(needle, needle_length, is_forward); - stringsearch::Vector v_haystack( - haystack, haystack_length, is_forward); - size_t diff = haystack_length - needle_length; - size_t relative_start_index; - if (is_forward) { - relative_start_index = start_index; - } else if (diff < start_index) { - relative_start_index = 0; - } else { - relative_start_index = diff - start_index; - } - size_t pos = node::stringsearch::SearchString( - v_haystack, v_needle, relative_start_index); - if (pos == haystack_length) { - // not found - return pos; - } - return is_forward ? pos : (haystack_length - needle_length - pos); -} - -template -size_t SearchString(const char* haystack, size_t haystack_length, - const char (&needle)[N]) { - return SearchString( - reinterpret_cast(haystack), haystack_length, - reinterpret_cast(needle), N - 1, 0, true); -} - -} // namespace node - -#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS - -#endif // SRC_STRING_SEARCH_H_ diff --git a/test/parallel/test-process-versions.js b/test/parallel/test-process-versions.js index 374c339cd814f5..bab483a7e2637e 100644 --- a/test/parallel/test-process-versions.js +++ b/test/parallel/test-process-versions.js @@ -23,6 +23,7 @@ const expected_keys = [ 'sqlite', 'ada', 'cjs_module_lexer', + 'nbytes', ]; const hasUndici = process.config.variables.node_builtin_shareable_builtins.includes('deps/undici/undici.js'); @@ -62,6 +63,7 @@ assert.match(process.versions.brotli, commonTemplate); assert.match(process.versions.llhttp, commonTemplate); assert.match(process.versions.node, commonTemplate); assert.match(process.versions.uv, commonTemplate); +assert.match(process.versions.nbytes, commonTemplate); assert.match(process.versions.zlib, /^\d+(?:\.\d+){1,3}(?:-.*)?$/); if (hasUndici) {