From b5c5804c8fc33b04f975ec41bda1fba86973aaed Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Mon, 5 Jun 2023 18:40:33 +0200 Subject: [PATCH] Vectorize ROW initialization --- .github/actions/spelling/allow/apis.txt | 1 + src/buffer/out/Row.cpp | 94 +++++++++++++++++++++++-- src/buffer/out/Row.hpp | 19 ++++- src/buffer/out/textBuffer.cpp | 8 +-- src/buffer/out/textBuffer.hpp | 2 +- 5 files changed, 110 insertions(+), 14 deletions(-) diff --git a/.github/actions/spelling/allow/apis.txt b/.github/actions/spelling/allow/apis.txt index 38ef5c30d01..44930f583b7 100644 --- a/.github/actions/spelling/allow/apis.txt +++ b/.github/actions/spelling/allow/apis.txt @@ -86,6 +86,7 @@ IObject iosfwd IPackage IPeasant +isa ISetup isspace IStorage diff --git a/src/buffer/out/Row.cpp b/src/buffer/out/Row.cpp index 50b4d3c347d..6dc886fc829 100644 --- a/src/buffer/out/Row.cpp +++ b/src/buffer/out/Row.cpp @@ -9,6 +9,8 @@ #include "textBuffer.hpp" #include "../../types/inc/GlyphWidth.hpp" +extern "C" long __isa_enabled; + // The STL is missing a std::iota_n analogue for std::iota, so I made my own. template constexpr OutIt iota_n(OutIt dest, Diff count, T val) @@ -82,10 +84,7 @@ ROW::ROW(wchar_t* charsBuffer, uint16_t* charOffsetsBuffer, uint16_t rowWidth, c _attr{ rowWidth, fillAttribute }, _columnCount{ rowWidth } { - if (_chars.data()) - { - _init(); - } + _init(); } void ROW::SetWrapForced(const bool wrap) noexcept @@ -124,7 +123,7 @@ LineRendition ROW::GetLineRendition() const noexcept // - Attr - The default attribute (color) to fill // Return Value: // - -void ROW::Reset(const TextAttribute& attr) +void ROW::Reset(const TextAttribute& attr) noexcept { _charsHeap.reset(); _chars = { _charsBuffer, _columnCount }; @@ -139,8 +138,89 @@ void ROW::Reset(const TextAttribute& attr) void ROW::_init() noexcept { - std::fill_n(_chars.begin(), _columnCount, UNICODE_SPACE); - std::iota(_charOffsets.begin(), _charOffsets.end(), uint16_t{ 0 }); +#pragma warning(push) +#pragma warning(disable : 26462) // The value pointed to by '...' is assigned only once, mark it as a pointer to const (con.4). +#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1). +#pragma warning(disable : 26490) // Don't use reinterpret_cast (type.1). + + // Fills _charsBuffer with whitespace and correspondingly _charOffsets + // with successive numbers from 0 to _columnCount+1. +#if defined(TIL_SSE_INTRINSICS) + alignas(__m256i) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + + if ((__isa_enabled & (1 << __ISA_AVAILABLE_AVX2)) && _columnCount >= 16) + { + auto chars = _charsBuffer; + auto charOffsets = _charOffsets.data(); + + const auto tailColumnOffset = gsl::narrow_cast((_columnCount - 8u) & ~7); + const auto charsEndLoop = chars + tailColumnOffset; + const auto charOffsetsEndLoop = charOffsets + tailColumnOffset; + + const auto whitespace = _mm256_set1_epi16(L' '); + auto offsetsLoop = _mm256_load_si256(reinterpret_cast(&offsetsData[0])); + const auto offsets = _mm256_add_epi16(offsetsLoop, _mm256_set1_epi16(tailColumnOffset)); + + if (chars < charsEndLoop) + { + const auto increment = _mm256_set1_epi16(16); + + do + { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(chars), whitespace); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsets), offsetsLoop); + offsetsLoop = _mm256_add_epi16(offsetsLoop, increment); + chars += 16; + charOffsets += 16; + } while (chars < charsEndLoop); + } + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(charsEndLoop), whitespace); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsetsEndLoop), offsets); + } + else + { + auto chars = _charsBuffer; + auto charOffsets = _charOffsets.data(); + const auto charsEnd = chars + _columnCount; + + const auto whitespace = _mm_set1_epi16(L' '); + const auto increment = _mm_set1_epi16(8); + auto offsets = _mm_load_si128(reinterpret_cast(&offsetsData[0])); + + do + { + _mm_storeu_si128(reinterpret_cast<__m128i*>(chars), whitespace); + _mm_storeu_si128(reinterpret_cast<__m128i*>(charOffsets), offsets); + offsets = _mm_add_epi16(offsets, increment); + chars += 8; + charOffsets += 8; + } while (chars <= charsEnd); + } +#elif defined(TIL_ARM_NEON_INTRINSICS) + alignas(uint16x8_t) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7 }; + + auto chars = _charsBuffer; + auto charOffsets = _charOffsets.data(); + const auto charsEnd = chars + _columnCount; + + const auto whitespace = vdupq_n_u16(L' '); + const auto increment = vdupq_n_u16(8); + auto offsets = vld1q_u16(&offsetsData[0]); + + do + { + vst1q_u16(chars, whitespace); + vst1q_u16(charOffsets, offsets); + offsets = vaddq_u16(offsets, increment); + chars += 8; + charOffsets += 8; + } while (chars <= charsEnd); +#else +#error "This method benefits greatly from vectorization, which makes text processing significantly faster overall (+40% with AVX2). If you intentionally break my performance I'll break you. :)" +#endif + +#pragma warning(push) } void ROW::TransferAttributes(const til::small_rle& attr, til::CoordType newWidth) diff --git a/src/buffer/out/Row.hpp b/src/buffer/out/Row.hpp index d2afd5bf8b5..f41feb2fa70 100644 --- a/src/buffer/out/Row.hpp +++ b/src/buffer/out/Row.hpp @@ -60,6 +60,23 @@ struct RowWriteState class ROW final { public: + // The implicit agreement between ROW and TextBuffer is that the `charsBuffer` and `charOffsetsBuffer` + // arrays have a minimum alignment of 16 Bytes and a size of `rowWidth+1`. The former is used to + // implement Reset() efficiently via SIMD and the latter is used to store the past-the-end offset + // into the `charsBuffer`. Even though the `charsBuffer` could be only `rowWidth` large we need them + // to be the same size so that the SIMD code can process both arrays in the same loop simultaneously. + // This wastes up to 5.8% memory but increases overall scrolling performance by around 40%. + // + // This method exists to make this agreement explicit and serve as a reminder. + static constexpr size_t CalculateCharsBufferStride(size_t columns) noexcept + { + return (columns * sizeof(wchar_t) + 16) & ~15; + } + static constexpr size_t CalculateCharOffsetsBufferStride(size_t columns) noexcept + { + return (columns * sizeof(uint16_t) + 16) & ~15; + } + ROW() = default; ROW(wchar_t* charsBuffer, uint16_t* charOffsetsBuffer, uint16_t rowWidth, const TextAttribute& fillAttribute); @@ -76,7 +93,7 @@ class ROW final void SetLineRendition(const LineRendition lineRendition) noexcept; LineRendition GetLineRendition() const noexcept; - void Reset(const TextAttribute& attr); + void Reset(const TextAttribute& attr) noexcept; void TransferAttributes(const til::small_rle& attr, til::CoordType newWidth); til::CoordType NavigateToPrevious(til::CoordType column) const noexcept; diff --git a/src/buffer/out/textBuffer.cpp b/src/buffer/out/textBuffer.cpp index 4eb6d5e0689..894d4681de0 100644 --- a/src/buffer/out/textBuffer.cpp +++ b/src/buffer/out/textBuffer.cpp @@ -698,10 +698,8 @@ wil::unique_virtualalloc_ptr TextBuffer::_allocateBuffer(til::size sz const auto w = gsl::narrow(sz.width); const auto h = gsl::narrow(sz.height); - const auto charsBytes = w * sizeof(wchar_t); - // The ROW::_indices array stores 1 more item than the buffer is wide. - // That extra column stores the past-the-end _chars pointer. - const auto indicesBytes = w * sizeof(uint16_t) + sizeof(uint16_t); + const auto charsBytes = ROW::CalculateCharsBufferStride(w); + const auto indicesBytes = ROW::CalculateCharOffsetsBufferStride(w); const auto rowStride = charsBytes + indicesBytes; // 65535*65535 cells would result in a charsAreaSize of 8GiB. // --> Use uint64_t so that we can safely do our calculations even on x86. @@ -926,7 +924,7 @@ til::point TextBuffer::BufferToScreenPosition(const til::point position) const n // Routine Description: // - Resets the text contents of this buffer with the default character // and the default current color attributes -void TextBuffer::Reset() +void TextBuffer::Reset() noexcept { const auto attr = GetCurrentAttributes(); diff --git a/src/buffer/out/textBuffer.hpp b/src/buffer/out/textBuffer.hpp index 0826ec8d7e9..ac01fef8a0c 100644 --- a/src/buffer/out/textBuffer.hpp +++ b/src/buffer/out/textBuffer.hpp @@ -138,7 +138,7 @@ class TextBuffer final til::point ScreenToBufferPosition(const til::point position) const noexcept; til::point BufferToScreenPosition(const til::point position) const noexcept; - void Reset(); + void Reset() noexcept; [[nodiscard]] HRESULT ResizeTraditional(const til::size newSize) noexcept;