Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use inline Vector{128|256}.Create for constants #54827

Merged
3 commits merged into from
Jul 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 26 additions & 19 deletions src/libraries/System.Collections/src/System/Collections/BitArray.cs
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,6 @@ public BitArray(byte[] bytes)
_version = 0;
}

private static readonly Vector128<byte> s_bitMask128 = BitConverter.IsLittleEndian ?
Vector128.Create(0x80402010_08040201).AsByte() :
Vector128.Create(0x01020408_10204080).AsByte();

private const uint Vector128ByteCount = 16;
private const uint Vector128IntCount = 4;
private const uint Vector256ByteCount = 32;
Expand Down Expand Up @@ -190,6 +186,10 @@ public unsafe BitArray(bool[] values)
// However comparison against zero can be replaced to cmeq against zero (vceqzq_s8)
// See dotnet/runtime#33972 for details
Vector128<byte> zero = Vector128<byte>.Zero;
Vector128<byte> bitMask128 = BitConverter.IsLittleEndian ?
Vector128.Create(0x80402010_08040201).AsByte() :
Vector128.Create(0x01020408_10204080).AsByte();

fixed (bool* ptr = values)
{
for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u)
Expand All @@ -199,15 +199,15 @@ public unsafe BitArray(bool[] values)
// and combine by ORing all of them together (In this case, adding all of them does the same thing)
Vector128<byte> lowerVector = AdvSimd.LoadVector128((byte*)ptr + i);
Vector128<byte> lowerIsFalse = AdvSimd.CompareEqual(lowerVector, zero);
Vector128<byte> bitsExtracted1 = AdvSimd.And(lowerIsFalse, s_bitMask128);
Vector128<byte> bitsExtracted1 = AdvSimd.And(lowerIsFalse, bitMask128);
bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
Vector128<short> lowerPackedIsFalse = bitsExtracted1.AsInt16();

Vector128<byte> upperVector = AdvSimd.LoadVector128((byte*)ptr + i + Vector128<byte>.Count);
Vector128<byte> upperIsFalse = AdvSimd.CompareEqual(upperVector, zero);
Vector128<byte> bitsExtracted2 = AdvSimd.And(upperIsFalse, s_bitMask128);
Vector128<byte> bitsExtracted2 = AdvSimd.And(upperIsFalse, bitMask128);
bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
Expand Down Expand Up @@ -857,12 +857,6 @@ public int Length
}
}

// The mask used when shuffling a single int into Vector128/256.
// On little endian machines, the lower 8 bits of int belong in the first byte, next lower 8 in the second and so on.
// We place the bytes that contain the bits to its respective byte so that we can mask out only the relevant bits later.
private static readonly Vector128<byte> s_lowerShuffleMask_CopyToBoolArray = Vector128.Create(0, 0x01010101_01010101).AsByte();
private static readonly Vector128<byte> s_upperShuffleMask_CopyToBoolArray = Vector128.Create(0x02020202_02020202, 0x03030303_03030303).AsByte();

public unsafe void CopyTo(Array array, int index)
{
if (array == null)
Expand Down Expand Up @@ -953,9 +947,15 @@ public unsafe void CopyTo(Array array, int index)
if (m_length < BitsPerInt32)
goto LessThan32;

// The mask used when shuffling a single int into Vector128/256.
// On little endian machines, the lower 8 bits of int belong in the first byte, next lower 8 in the second and so on.
// We place the bytes that contain the bits to its respective byte so that we can mask out only the relevant bits later.
Vector128<byte> lowerShuffleMask_CopyToBoolArray = Vector128.Create(0, 0x01010101_01010101).AsByte();
Vector128<byte> upperShuffleMask_CopyToBoolArray = Vector128.Create(0x02020202_02020202, 0x03030303_03030303).AsByte();

if (Avx2.IsSupported)
{
Vector256<byte> shuffleMask = Vector256.Create(s_lowerShuffleMask_CopyToBoolArray, s_upperShuffleMask_CopyToBoolArray);
Vector256<byte> shuffleMask = Vector256.Create(lowerShuffleMask_CopyToBoolArray, upperShuffleMask_CopyToBoolArray);
Vector256<byte> bitMask = Vector256.Create(0x80402010_08040201).AsByte();
Vector256<byte> ones = Vector256.Create((byte)1);

Expand All @@ -977,9 +977,12 @@ public unsafe void CopyTo(Array array, int index)
}
else if (Ssse3.IsSupported)
{
Vector128<byte> lowerShuffleMask = s_lowerShuffleMask_CopyToBoolArray;
Vector128<byte> upperShuffleMask = s_upperShuffleMask_CopyToBoolArray;
Vector128<byte> lowerShuffleMask = lowerShuffleMask_CopyToBoolArray;
Vector128<byte> upperShuffleMask = upperShuffleMask_CopyToBoolArray;
Vector128<byte> ones = Vector128.Create((byte)1);
Vector128<byte> bitMask128 = BitConverter.IsLittleEndian ?
Vector128.Create(0x80402010_08040201).AsByte() :
Vector128.Create(0x01020408_10204080).AsByte();

fixed (bool* destination = &boolArray[index])
{
Expand All @@ -989,12 +992,12 @@ public unsafe void CopyTo(Array array, int index)
Vector128<int> scalar = Vector128.CreateScalarUnsafe(bits);

Vector128<byte> shuffledLower = Ssse3.Shuffle(scalar.AsByte(), lowerShuffleMask);
Vector128<byte> extractedLower = Sse2.And(shuffledLower, s_bitMask128);
Vector128<byte> extractedLower = Sse2.And(shuffledLower, bitMask128);
Vector128<byte> normalizedLower = Sse2.Min(extractedLower, ones);
Sse2.Store((byte*)destination + i, normalizedLower);

Vector128<byte> shuffledHigher = Ssse3.Shuffle(scalar.AsByte(), upperShuffleMask);
Vector128<byte> extractedHigher = Sse2.And(shuffledHigher, s_bitMask128);
Vector128<byte> extractedHigher = Sse2.And(shuffledHigher, bitMask128);
Vector128<byte> normalizedHigher = Sse2.Min(extractedHigher, ones);
Sse2.Store((byte*)destination + i + Vector128<byte>.Count, normalizedHigher);
}
Expand All @@ -1003,6 +1006,10 @@ public unsafe void CopyTo(Array array, int index)
else if (AdvSimd.IsSupported)
{
Vector128<byte> ones = Vector128.Create((byte)1);
Vector128<byte> bitMask128 = BitConverter.IsLittleEndian ?
Vector128.Create(0x80402010_08040201).AsByte() :
Vector128.Create(0x01020408_10204080).AsByte();

fixed (bool* destination = &boolArray[index])
{
for (; (i + Vector128ByteCount * 2u) <= (uint)m_length; i += Vector128ByteCount * 2u)
Expand All @@ -1028,12 +1035,12 @@ public unsafe void CopyTo(Array array, int index)
vector = AdvSimd.Arm64.ZipLow(vector, vector);

Vector128<byte> shuffledLower = AdvSimd.Arm64.ZipLow(vector, vector);
Vector128<byte> extractedLower = AdvSimd.And(shuffledLower, s_bitMask128);
Vector128<byte> extractedLower = AdvSimd.And(shuffledLower, bitMask128);
Vector128<byte> normalizedLower = AdvSimd.Min(extractedLower, ones);
AdvSimd.Store((byte*)destination + i, normalizedLower);

Vector128<byte> shuffledHigher = AdvSimd.Arm64.ZipHigh(vector, vector);
Vector128<byte> extractedHigher = AdvSimd.And(shuffledHigher, s_bitMask128);
Vector128<byte> extractedHigher = AdvSimd.And(shuffledHigher, bitMask128);
Vector128<byte> normalizedHigher = AdvSimd.Min(extractedHigher, ones);
AdvSimd.Store((byte*)destination + i + Vector128<byte>.Count, normalizedHigher);
}
Expand Down
8 changes: 0 additions & 8 deletions src/libraries/System.Memory/src/System/Buffers/Text/Base64.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,12 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using Internal.Runtime.CompilerServices;

namespace System.Buffers.Text
{
public static partial class Base64
{
private static TVector ReadVector<TVector>(ReadOnlySpan<sbyte> data)
{
ref sbyte tmp = ref MemoryMarshal.GetReference(data);
return Unsafe.As<sbyte, TVector>(ref tmp);
}

[Conditional("DEBUG")]
private static unsafe void AssertRead<TVector>(byte* src, byte* srcStart, int srcLength)
{
Expand Down
172 changes: 77 additions & 95 deletions src/libraries/System.Memory/src/System/Buffers/Text/Base64Decoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Spa
maxSrcLength = (destLength / 3) * 4;
}

ref sbyte decodingMap = ref MemoryMarshal.GetReference(s_decodingMap);
ref sbyte decodingMap = ref MemoryMarshal.GetReference(DecodingMap);
srcMax = srcBytes + (uint)maxSrcLength;

while (src < srcMax)
Expand Down Expand Up @@ -275,7 +275,7 @@ public static unsafe OperationStatus DecodeFromUtf8InPlace(Span<byte> buffer, ou
if (bufferLength == 0)
goto DoneExit;

ref sbyte decodingMap = ref MemoryMarshal.GetReference(s_decodingMap);
ref sbyte decodingMap = ref MemoryMarshal.GetReference(DecodingMap);

while (sourceIndex < bufferLength - 4)
{
Expand Down Expand Up @@ -362,14 +362,59 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b
// See SSSE3-version below for an explanation of how the code works.

// The JIT won't hoist these "constants", so help it
Vector256<sbyte> lutHi = ReadVector<Vector256<sbyte>>(s_avxDecodeLutHi);
Vector256<sbyte> lutLo = ReadVector<Vector256<sbyte>>(s_avxDecodeLutLo);
Vector256<sbyte> lutShift = ReadVector<Vector256<sbyte>>(s_avxDecodeLutShift);
Vector256<sbyte> lutHi = Vector256.Create(
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10);

Vector256<sbyte> lutLo = Vector256.Create(
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A,
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A);

Vector256<sbyte> lutShift = Vector256.Create(
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0,
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0);

Vector256<sbyte> packBytesInLaneMask = Vector256.Create(
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1,
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1);

Vector256<int> packLanesControl = Vector256.Create(
0, 0, 0, 0,
1, 0, 0, 0,
2, 0, 0, 0,
4, 0, 0, 0,
5, 0, 0, 0,
6, 0, 0, 0,
-1, -1, -1, -1,
-1, -1, -1, -1).AsInt32();

Vector256<sbyte> mask2F = Vector256.Create((sbyte)'/');
Vector256<sbyte> mergeConstant0 = Vector256.Create(0x01400140).AsSByte();
Vector256<short> mergeConstant1 = Vector256.Create(0x00011000).AsInt16();
Vector256<sbyte> packBytesInLaneMask = ReadVector<Vector256<sbyte>>(s_avxDecodePackBytesInLaneMask);
Vector256<int> packLanesControl = ReadVector<Vector256<sbyte>>(s_avxDecodePackLanesControl).AsInt32();

byte* src = srcBytes;
byte* dest = destBytes;
Expand Down Expand Up @@ -508,13 +553,33 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes,
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10

// The JIT won't hoist these "constants", so help it
Vector128<sbyte> lutHi = ReadVector<Vector128<sbyte>>(s_sseDecodeLutHi);
Vector128<sbyte> lutLo = ReadVector<Vector128<sbyte>>(s_sseDecodeLutLo);
Vector128<sbyte> lutShift = ReadVector<Vector128<sbyte>>(s_sseDecodeLutShift);
Vector128<sbyte> lutHi = Vector128.Create(
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10);

Vector128<sbyte> lutLo = Vector128.Create(
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A);

Vector128<sbyte> lutShift = Vector128.Create(
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0);

Vector128<sbyte> packBytesMask = Vector128.Create(
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1);

Vector128<sbyte> mask2F = Vector128.Create((sbyte)'/');
Vector128<sbyte> mergeConstant0 = Vector128.Create(0x01400140).AsSByte();
Vector128<short> mergeConstant1 = Vector128.Create(0x00011000).AsInt16();
Vector128<sbyte> packBytesMask = ReadVector<Vector128<sbyte>>(s_sseDecodePackBytesMask);
Vector128<sbyte> zero = Vector128<sbyte>.Zero;

byte* src = srcBytes;
Expand Down Expand Up @@ -613,7 +678,7 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value)
}

// Pre-computing this table using a custom string(s_characters) and GenerateDecodingMapAndVerify (found in tests)
private static ReadOnlySpan<sbyte> s_decodingMap => new sbyte[] {
private static ReadOnlySpan<sbyte> DecodingMap => new sbyte[] {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, //62 is placed at index 43 (for +), 63 at index 47 (for /)
Expand All @@ -631,88 +696,5 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value)
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};

private static ReadOnlySpan<sbyte> s_sseDecodePackBytesMask => new sbyte[] {
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1
};

private static ReadOnlySpan<sbyte> s_sseDecodeLutLo => new sbyte[] {
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A
};

private static ReadOnlySpan<sbyte> s_sseDecodeLutHi => new sbyte[] {
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10
};

private static ReadOnlySpan<sbyte> s_sseDecodeLutShift => new sbyte[] {
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0
};

private static ReadOnlySpan<sbyte> s_avxDecodePackBytesInLaneMask => new sbyte[] {
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1,
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1
};

private static ReadOnlySpan<sbyte> s_avxDecodePackLanesControl => new sbyte[] {
0, 0, 0, 0,
1, 0, 0, 0,
2, 0, 0, 0,
4, 0, 0, 0,
5, 0, 0, 0,
6, 0, 0, 0,
-1, -1, -1, -1,
-1, -1, -1, -1
};

private static ReadOnlySpan<sbyte> s_avxDecodeLutLo => new sbyte[] {
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A,
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A
};

private static ReadOnlySpan<sbyte> s_avxDecodeLutHi => new sbyte[] {
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10
};

private static ReadOnlySpan<sbyte> s_avxDecodeLutShift => new sbyte[] {
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0,
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0
};
}
}
Loading