Skip to content

Commit

Permalink
Vectorize ROW initialization
Browse files Browse the repository at this point in the history
  • Loading branch information
lhecker committed Jun 13, 2023
1 parent 8f8c79f commit a9763f7
Show file tree
Hide file tree
Showing 6 changed files with 136 additions and 9 deletions.
1 change: 1 addition & 0 deletions .github/actions/spelling/allow/apis.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ IObject
iosfwd
IPackage
IPeasant
isa
ISetup
isspace
IStorage
Expand Down
5 changes: 1 addition & 4 deletions .github/actions/spelling/expect/expect.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
aabbcc
aarch
ABANDONFONT
abbcc
ABCDEFGHIJKLMNOPQRSTUVWXY
Expand Down Expand Up @@ -157,7 +158,6 @@ capslock
CARETBLINKINGENABLED
CARRIAGERETURN
cascadia
castsi
catid
cazamor
CBash
Expand Down Expand Up @@ -216,7 +216,6 @@ cmder
CMDEXT
cmh
CMOUSEBUTTONS
cmpeq
cmt
cmw
cmyk
Expand Down Expand Up @@ -1024,7 +1023,6 @@ lnkd
lnkfile
LNM
LOADONCALL
loadu
LOBYTE
localappdata
locsrc
Expand Down Expand Up @@ -1155,7 +1153,6 @@ MOUSEACTIVATE
MOUSEFIRST
MOUSEHWHEEL
MOUSEMOVE
movemask
MOVESTART
msb
msctf
Expand Down
6 changes: 6 additions & 0 deletions .github/actions/spelling/patterns/patterns.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ ROY\sG\.\sBIV
# Python stringprefix / binaryprefix
\b(?:B|BR|Br|F|FR|Fr|R|RB|RF|Rb|Rf|U|UR|Ur|b|bR|br|f|fR|fr|r|rB|rF|rb|rf|u|uR|ur)'

# SSE intrinsics like "_mm_subs_epu16"
\b_mm(?:|256|512)_\w+\b

# ARM NEON intrinsics like "vsubq_u16"
\bv\w+_[fsu](?:8|16|32|64)\b

# Automatically suggested patterns
# hit-count: 3831 file-count: 582
# IServiceProvider
Expand Down
113 changes: 112 additions & 1 deletion src/buffer/out/Row.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include "textBuffer.hpp"
#include "../../types/inc/GlyphWidth.hpp"

extern "C" int __isa_available;

// The STL is missing a std::iota_n analogue for std::iota, so I made my own.
template<typename OutIt, typename Diff, typename T>
constexpr OutIt iota_n(OutIt dest, Diff count, T val)
Expand Down Expand Up @@ -134,8 +136,117 @@ void ROW::Reset(const TextAttribute& attr)

void ROW::_init() noexcept
{
std::fill_n(_chars.begin(), _columnCount, UNICODE_SPACE);
#pragma warning(push)
#pragma warning(disable : 26462) // The value pointed to by '...' is assigned only once, mark it as a pointer to const (con.4).
#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
#pragma warning(disable : 26490) // Don't use reinterpret_cast (type.1).

// Fills _charsBuffer with whitespace and correspondingly _charOffsets
// with successive numbers from 0 to _columnCount+1.
#if defined(TIL_SSE_INTRINSICS)
alignas(__m256i) static constexpr uint16_t whitespaceData[]{ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 };
alignas(__m256i) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
alignas(__m256i) static constexpr uint16_t increment16Data[]{ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 };
alignas(__m128i) static constexpr uint16_t increment8Data[]{ 8, 8, 8, 8, 8, 8, 8, 8 };

// The AVX loop operates on 32 bytes at a minimum. Since _charsBuffer/_charOffsets uses 2 byte large
// wchar_t/uint16_t respectively, this translates to 16-element writes, which equals a _columnCount of 15,
// because it doesn't include the past-the-end char-offset as described in the _charOffsets member comment.
if (__isa_available >= __ISA_AVAILABLE_AVX2 && _columnCount >= 15)
{
auto chars = _charsBuffer;
auto charOffsets = _charOffsets.data();

// The backing buffer for both chars and charOffsets is guaranteed to be 16-byte aligned,
// but AVX operations are 32-byte large. As such, when we write out the last chunk, we
// have to align it to the ends of the 2 buffers. This results in a potential overlap of
// 16 bytes between the last write in the main loop below and the final write afterwards.
//
// An example:
// If you have a terminal between 16 and 23 columns the buffer has a size of 48 bytes.
// The main loop below will iterate once, as it writes out bytes 0-31 and then exits.
// The final write afterwards cannot write bytes 32-63 because that would write
// out of bounds. Instead it writes bytes 16-47, overwriting 16 overlapping bytes.
// This is better than branching and switching to SSE2, because both things are slow.
//
// Since we want to exit the main loop with at least 1 write left to do as the final write,
// we need to subtract 1 alignment from the buffer length (= 16 bytes). Since _columnCount is
// in wchar_t's we subtract -8. The same applies to the ~7 here vs ~15. If you squint slightly
// you'll see how this is effectively the inverse of what CalculateCharsBufferStride does.
const auto tailColumnOffset = gsl::narrow_cast<uint16_t>((_columnCount - 8u) & ~7);
const auto charsEndLoop = chars + tailColumnOffset;
const auto charOffsetsEndLoop = charOffsets + tailColumnOffset;

const auto whitespace = _mm256_load_si256(reinterpret_cast<const __m256i*>(&whitespaceData[0]));
auto offsetsLoop = _mm256_load_si256(reinterpret_cast<const __m256i*>(&offsetsData[0]));
const auto offsets = _mm256_add_epi16(offsetsLoop, _mm256_set1_epi16(tailColumnOffset));

if (chars < charsEndLoop)
{
const auto increment = _mm256_load_si256(reinterpret_cast<const __m256i*>(&increment16Data[0]));

do
{
_mm256_storeu_si256(reinterpret_cast<__m256i*>(chars), whitespace);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsets), offsetsLoop);
offsetsLoop = _mm256_add_epi16(offsetsLoop, increment);
chars += 16;
charOffsets += 16;
} while (chars < charsEndLoop);
}

_mm256_storeu_si256(reinterpret_cast<__m256i*>(charsEndLoop), whitespace);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsetsEndLoop), offsets);
}
else
{
auto chars = _charsBuffer;
auto charOffsets = _charOffsets.data();
const auto charsEnd = chars + _columnCount;

const auto whitespace = _mm_load_si128(reinterpret_cast<const __m128i*>(&whitespaceData[0]));
const auto increment = _mm_load_si128(reinterpret_cast<const __m128i*>(&increment8Data[0]));
auto offsets = _mm_load_si128(reinterpret_cast<const __m128i*>(&offsetsData[0]));

do
{
_mm_storeu_si128(reinterpret_cast<__m128i*>(chars), whitespace);
_mm_storeu_si128(reinterpret_cast<__m128i*>(charOffsets), offsets);
offsets = _mm_add_epi16(offsets, increment);
chars += 8;
charOffsets += 8;
// If _columnCount is something like 120, the actual backing buffer for charOffsets is 121 items large.
// --> The while loop uses <= to emit at least 1 more write.
} while (chars <= charsEnd);
}
#elif defined(TIL_ARM_NEON_INTRINSICS)
alignas(uint16x8_t) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7 };

auto chars = _charsBuffer;
auto charOffsets = _charOffsets.data();
const auto charsEnd = chars + _columnCount;

const auto whitespace = vdupq_n_u16(L' ');
const auto increment = vdupq_n_u16(8);
auto offsets = vld1q_u16(&offsetsData[0]);

do
{
vst1q_u16(chars, whitespace);
vst1q_u16(charOffsets, offsets);
offsets = vaddq_u16(offsets, increment);
chars += 8;
charOffsets += 8;
// If _columnCount is something like 120, the actual backing buffer for charOffsets is 121 items large.
// --> The while loop uses <= to emit at least 1 more write.
} while (chars <= charsEnd);
#else
#error "Vectorizing this function improves overall performance by up to 40%. Don't remove this warning, just add the vectorized code."
std::fill_n(_charsBuffer, _columnCount, UNICODE_SPACE);
std::iota(_charOffsets.begin(), _charOffsets.end(), uint16_t{ 0 });
#endif

#pragma warning(push)
}

void ROW::TransferAttributes(const til::small_rle<TextAttribute, uint16_t, 1>& attr, til::CoordType newWidth)
Expand Down
11 changes: 7 additions & 4 deletions src/buffer/out/Row.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,19 @@ struct RowWriteState
class ROW final
{
public:
// The implicit agreement between ROW and TextBuffer is that TextBuffer supplies ROW with a charsBuffer of at
// least `columns * sizeof(wchar_t)` bytes and a charOffsetsBuffer of at least `(columns + 1) * sizeof(uint16_t)`
// bytes (see ROW::_charOffsets for why it needs space for 1 additional offset).
// The implicit agreement between ROW and TextBuffer is that the `charsBuffer` and `charOffsetsBuffer`
// arrays have a minimum alignment of 16 Bytes and a size of `rowWidth+1`. The former is used to
// implement Reset() efficiently via SIMD and the latter is used to store the past-the-end offset
// into the `charsBuffer`. Even though the `charsBuffer` could be only `rowWidth` large we need them
// to be the same size so that the SIMD code can process both arrays in the same loop simultaneously.
// This wastes up to 5.8% memory but increases overall scrolling performance by around 40%.
// These methods exists to make this agreement explicit and serve as a reminder.
//
// TextBuffer calculates the distance in bytes between two ROWs (_bufferRowStride) as the sum of these values.
// As such it's important that we return sizes with a minimum alignment of alignof(ROW).
static constexpr size_t CalculateRowSize() noexcept
{
return sizeof(ROW);
return (sizeof(ROW) + 15) & ~15;
}
static constexpr size_t CalculateCharsBufferSize(size_t columns) noexcept
{
Expand Down
9 changes: 9 additions & 0 deletions src/inc/til.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@

#pragma once

// This is a copy of how DirectXMath.h determines _XM_SSE_INTRINSICS_ and _XM_ARM_NEON_INTRINSICS_.
#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC)
#define TIL_SSE_INTRINSICS
#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__
#define TIL_ARM_NEON_INTRINSICS
#else
#define TIL_NO_INTRINSICS
#endif

#define _TIL_INLINEPREFIX __declspec(noinline) inline

#include "til/at.h"
Expand Down

0 comments on commit a9763f7

Please sign in to comment.