-
Notifications
You must be signed in to change notification settings - Fork 8.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Vectorize ROW initialization #15501
Vectorize ROW initialization #15501
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -87,6 +87,7 @@ IObject | |
iosfwd | ||
IPackage | ||
IPeasant | ||
isa | ||
ISetup | ||
isspace | ||
IStorage | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,8 @@ | |
#include "textBuffer.hpp" | ||
#include "../../types/inc/GlyphWidth.hpp" | ||
|
||
extern "C" int __isa_available; | ||
|
||
// The STL is missing a std::iota_n analogue for std::iota, so I made my own. | ||
template<typename OutIt, typename Diff, typename T> | ||
constexpr OutIt iota_n(OutIt dest, Diff count, T val) | ||
|
@@ -134,8 +136,117 @@ void ROW::Reset(const TextAttribute& attr) | |
|
||
void ROW::_init() noexcept | ||
{ | ||
std::fill_n(_chars.begin(), _columnCount, UNICODE_SPACE); | ||
#pragma warning(push) | ||
#pragma warning(disable : 26462) // The value pointed to by '...' is assigned only once, mark it as a pointer to const (con.4). | ||
#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1). | ||
#pragma warning(disable : 26490) // Don't use reinterpret_cast (type.1). | ||
|
||
// Fills _charsBuffer with whitespace and correspondingly _charOffsets | ||
// with successive numbers from 0 to _columnCount+1. | ||
#if defined(TIL_SSE_INTRINSICS) | ||
alignas(__m256i) static constexpr uint16_t whitespaceData[]{ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 }; | ||
alignas(__m256i) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; | ||
alignas(__m256i) static constexpr uint16_t increment16Data[]{ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }; | ||
alignas(__m128i) static constexpr uint16_t increment8Data[]{ 8, 8, 8, 8, 8, 8, 8, 8 }; | ||
|
||
// The AVX loop operates on 32 bytes at a minimum. Since _charsBuffer/_charOffsets uses 2 byte large | ||
// wchar_t/uint16_t respectively, this translates to 16-element writes, which equals a _columnCount of 15, | ||
// because it doesn't include the past-the-end char-offset as described in the _charOffsets member comment. | ||
if (__isa_available >= __ISA_AVAILABLE_AVX2 && _columnCount >= 15) | ||
{ | ||
auto chars = _charsBuffer; | ||
auto charOffsets = _charOffsets.data(); | ||
|
||
// The backing buffer for both chars and charOffsets is guaranteed to be 16-byte aligned, | ||
// but AVX operations are 32-byte large. As such, when we write out the last chunk, we | ||
// have to align it to the ends of the 2 buffers. This results in a potential overlap of | ||
// 16 bytes between the last write in the main loop below and the final write afterwards. | ||
// | ||
// An example: | ||
// If you have a terminal between 16 and 23 columns the buffer has a size of 48 bytes. | ||
// The main loop below will iterate once, as it writes out bytes 0-31 and then exits. | ||
// The final write afterwards cannot write bytes 32-63 because that would write | ||
// out of bounds. Instead it writes bytes 16-47, overwriting 16 overlapping bytes. | ||
// This is better than branching and switching to SSE2, because both things are slow. | ||
// | ||
// Since we want to exit the main loop with at least 1 write left to do as the final write, | ||
// we need to subtract 1 alignment from the buffer length (= 16 bytes). Since _columnCount is | ||
// in wchar_t's we subtract -8. The same applies to the ~7 here vs ~15. If you squint slightly | ||
// you'll see how this is effectively the inverse of what CalculateCharsBufferStride does. | ||
const auto tailColumnOffset = gsl::narrow_cast<uint16_t>((_columnCount - 8u) & ~7); | ||
const auto charsEndLoop = chars + tailColumnOffset; | ||
const auto charOffsetsEndLoop = charOffsets + tailColumnOffset; | ||
|
||
const auto whitespace = _mm256_load_si256(reinterpret_cast<const __m256i*>(&whitespaceData[0])); | ||
auto offsetsLoop = _mm256_load_si256(reinterpret_cast<const __m256i*>(&offsetsData[0])); | ||
const auto offsets = _mm256_add_epi16(offsetsLoop, _mm256_set1_epi16(tailColumnOffset)); | ||
|
||
if (chars < charsEndLoop) | ||
{ | ||
const auto increment = _mm256_load_si256(reinterpret_cast<const __m256i*>(&increment16Data[0])); | ||
|
||
do | ||
{ | ||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(chars), whitespace); | ||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsets), offsetsLoop); | ||
offsetsLoop = _mm256_add_epi16(offsetsLoop, increment); | ||
chars += 16; | ||
charOffsets += 16; | ||
} while (chars < charsEndLoop); | ||
} | ||
|
||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(charsEndLoop), whitespace); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so wait, this will write up to 15 things off the end of the buffer? and there's no risk that this is going to stomp anything important? Like, if the buffer is 17 columns wide... the char offsets buffer starts at alignment 16 from the end of the chars buffer, and the next ROW starts at alignment 16 from the end of the char offsets buffer? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It writes up to 15 bytes off the end of the buffer, which at a granularity of |
||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsetsEndLoop), offsets); | ||
} | ||
else | ||
{ | ||
auto chars = _charsBuffer; | ||
auto charOffsets = _charOffsets.data(); | ||
const auto charsEnd = chars + _columnCount; | ||
|
||
const auto whitespace = _mm_load_si128(reinterpret_cast<const __m128i*>(&whitespaceData[0])); | ||
const auto increment = _mm_load_si128(reinterpret_cast<const __m128i*>(&increment8Data[0])); | ||
auto offsets = _mm_load_si128(reinterpret_cast<const __m128i*>(&offsetsData[0])); | ||
|
||
do | ||
{ | ||
_mm_storeu_si128(reinterpret_cast<__m128i*>(chars), whitespace); | ||
_mm_storeu_si128(reinterpret_cast<__m128i*>(charOffsets), offsets); | ||
offsets = _mm_add_epi16(offsets, increment); | ||
chars += 8; | ||
charOffsets += 8; | ||
// If _columnCount is something like 120, the actual backing buffer for charOffsets is 121 items large. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see, i guess this is the part that scares me. every time we talk about the width of the backing buffers we're like, "YUP it's always +1" when in truth it is up to +16 or +32 or something. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I could add an "at least" in there when I merge main in. (It's only up to +8 btw.) |
||
// --> The while loop uses <= to emit at least 1 more write. | ||
} while (chars <= charsEnd); | ||
} | ||
#elif defined(TIL_ARM_NEON_INTRINSICS) | ||
alignas(uint16x8_t) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7 }; | ||
|
||
auto chars = _charsBuffer; | ||
auto charOffsets = _charOffsets.data(); | ||
const auto charsEnd = chars + _columnCount; | ||
|
||
const auto whitespace = vdupq_n_u16(L' '); | ||
const auto increment = vdupq_n_u16(8); | ||
auto offsets = vld1q_u16(&offsetsData[0]); | ||
|
||
do | ||
{ | ||
vst1q_u16(chars, whitespace); | ||
vst1q_u16(charOffsets, offsets); | ||
offsets = vaddq_u16(offsets, increment); | ||
chars += 8; | ||
charOffsets += 8; | ||
// If _columnCount is something like 120, the actual backing buffer for charOffsets is 121 items large. | ||
// --> The while loop uses <= to emit at least 1 more write. | ||
} while (chars <= charsEnd); | ||
#else | ||
#error "Vectorizing this function improves overall performance by up to 40%. Don't remove this warning, just add the vectorized code." | ||
std::fill_n(_charsBuffer, _columnCount, UNICODE_SPACE); | ||
std::iota(_charOffsets.begin(), _charOffsets.end(), uint16_t{ 0 }); | ||
#endif | ||
|
||
#pragma warning(push) | ||
} | ||
|
||
void ROW::TransferAttributes(const til::small_rle<TextAttribute, uint16_t, 1>& attr, til::CoordType newWidth) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,16 +60,19 @@ struct RowWriteState | |
class ROW final | ||
{ | ||
public: | ||
// The implicit agreement between ROW and TextBuffer is that TextBuffer supplies ROW with a charsBuffer of at | ||
// least `columns * sizeof(wchar_t)` bytes and a charOffsetsBuffer of at least `(columns + 1) * sizeof(uint16_t)` | ||
// bytes (see ROW::_charOffsets for why it needs space for 1 additional offset). | ||
// The implicit agreement between ROW and TextBuffer is that the `charsBuffer` and `charOffsetsBuffer` | ||
// arrays have a minimum alignment of 16 Bytes and a size of `rowWidth+1`. The former is used to | ||
// implement Reset() efficiently via SIMD and the latter is used to store the past-the-end offset | ||
// into the `charsBuffer`. Even though the `charsBuffer` could be only `rowWidth` large we need them | ||
// to be the same size so that the SIMD code can process both arrays in the same loop simultaneously. | ||
// This wastes up to 5.8% memory but increases overall scrolling performance by around 40%. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Aye. SIMD is free real estate in our CPUs - Let's use it. |
||
// These methods exists to make this agreement explicit and serve as a reminder. | ||
// | ||
// TextBuffer calculates the distance in bytes between two ROWs (_bufferRowStride) as the sum of these values. | ||
// As such it's important that we return sizes with a minimum alignment of alignof(ROW). | ||
static constexpr size_t CalculateRowSize() noexcept | ||
{ | ||
return sizeof(ROW); | ||
return (sizeof(ROW) + 15) & ~15; | ||
} | ||
static constexpr size_t CalculateCharsBufferSize(size_t columns) noexcept | ||
{ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fwiw nobody ever sets this to true?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is part of #15498. I can pull that change into this branch and rebase it on main so we can merge it immediately.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd love that! I'm personally OK merging these out of order, even if it means that the numbers in the perf discussion part are incorrect.