From 036e0a64bd190e80f1685c3b267e495b03bb2962 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Foidl?= <gue@korporal.at>
Date: Wed, 29 May 2019 17:29:31 +0200
Subject: [PATCH] Base64 encoding with simd-support (#34529)

* Optimized scalar code-path

* Fixed label names

* Implemented vectorized versions

* Added reference to source of algorithm

* Added back missing namespace

* Unsafe.Add instead of Unsafe.Subtract

Fixed build-failure (https://ci3.dot.net/job/dotnet_corefx/job/master/job/linux-musl-TGroup_netcoreapp+CGroup_Debug+AGroup_x64+TestOuter_false_prtest/8247/console)
Seems like the internal Unsafe doesn't have a Subtract method, so use Add instead.

* Added THIRD-PARTY-NOTICES

* PR Feedback

* THIRD-PARTY-NOTICES in repo-base instead of in folder

Cf. https://github.com/dotnet/corefx/pull/34529#issuecomment-453510246

* PR Feedback

* https://github.com/dotnet/corefx/pull/34529#discussion_r247200659
* https://github.com/dotnet/corefx/pull/34529#discussion_r247214904

* Rewritten to use raw-pointers instead of GC-tracked refs

Cf. https://github.com/dotnet/corefx/pull/34529#discussion_r247197669

* Initialized the static fields directly (i.e. w/o cctor)

Cf. https://github.com/dotnet/corefx/pull/34529#discussion_r247193419

* Added a test for decoding a (encoded) Guid

The case of decoding an encoded 16-byte value was not covered by tests, so incorrect code was committed earlier, resulting
in DestinationTooSmall instead of the correct Done.

* EncodingMap / DecodingMap as byref instead of pointer

So got rid of the `rep stosd` in the prolog. Cf. https://github.com/dotnet/corefx/pull/34529#discussion_r248075157

* PR Feedback

* https://github.com/dotnet/corefx/pull/34529#discussion_r262165689

* Debug.Fail instead of throwing for the assertion

Cf. https://github.com/dotnet/corefx/pull/34529#discussion_r263894301

* ROSpan for static data

* ROS for lookup maps

* In decode avoided stack spill and hoisted zero-vector outside the loops

Cf. https://github.com/dotnet/corefx/pull/34529#discussion_r287613894

* Assert assumption about destLength

Cf. https://github.com/dotnet/corefx/pull/34529#discussion_r287605561

* Added comments from original source and some changes to variable names

Cf. https://github.com/dotnet/corefx/pull/34529#discussion_r287606634 and https://github.com/dotnet/corefx/pull/34529#discussion_r287606714

* Use TestZ instead of MoveMask in AVX2-path

Cf. https://github.com/dotnet/corefx/pull/34529#discussion_r287825385

* Simplified the overly complicated mask2F creation

Improved the version done in c8b6cb3387ca856f52d246ad260172c8fe1d9dcd, so the static data isn't needed and code is more compact and readable.
---
 THIRD-PARTY-NOTICES.TXT                       |  32 +
 src/System.Memory/src/System.Memory.csproj    |   1 +
 .../src/System/Buffers/Text/Base64.cs         |  48 ++
 .../src/System/Buffers/Text/Base64Decoder.cs  | 796 +++++++++++++-----
 .../src/System/Buffers/Text/Base64Encoder.cs  | 559 +++++++++---
 .../tests/Base64/Base64DecoderUnitTests.cs    |  14 +
 6 files changed, 1134 insertions(+), 316 deletions(-)
 create mode 100644 src/System.Memory/src/System/Buffers/Text/Base64.cs

diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT
index b25636f506b8..de86db916f79 100644
--- a/THIRD-PARTY-NOTICES.TXT
+++ b/THIRD-PARTY-NOTICES.TXT
@@ -332,3 +332,35 @@ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+License notice for vectorized base64 encoding / decoding
+--------------------------------------------------------
+
+Copyright (c) 2005-2007, Nick Galbreath
+Copyright (c) 2013-2017, Alfred Klomp
+Copyright (c) 2015-2017, Wojciech Mula
+Copyright (c) 2016-2017, Matthieu Darbois
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+- Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/System.Memory/src/System.Memory.csproj b/src/System.Memory/src/System.Memory.csproj
index b63a96df0a13..97ba874bb2c4 100644
--- a/src/System.Memory/src/System.Memory.csproj
+++ b/src/System.Memory/src/System.Memory.csproj
@@ -27,6 +27,7 @@
     <Compile Include="System\Buffers\SequenceReader.cs" />
     <Compile Include="System\Buffers\SequenceReader.Search.cs" />
     <Compile Include="System\Buffers\SequenceReaderExtensions.Binary.cs" />
+    <Compile Include="System\Buffers\Text\Base64.cs" />
     <Compile Include="System\Buffers\Text\Base64Decoder.cs" />
     <Compile Include="System\Buffers\Text\Base64Encoder.cs" />
     <Compile Include="System\Runtime\InteropServices\SequenceMarshal.cs" />
diff --git a/src/System.Memory/src/System/Buffers/Text/Base64.cs b/src/System.Memory/src/System/Buffers/Text/Base64.cs
new file mode 100644
index 000000000000..34f3af433db3
--- /dev/null
+++ b/src/System.Memory/src/System/Buffers/Text/Base64.cs
@@ -0,0 +1,48 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using Internal.Runtime.CompilerServices;
+
+namespace System.Buffers.Text
+{
+    public static partial class Base64
+    {
+        private static TVector ReadVector<TVector>(ReadOnlySpan<sbyte> data)
+        {
+            ref sbyte tmp = ref MemoryMarshal.GetReference(data);
+            return Unsafe.As<sbyte, TVector>(ref tmp);
+        }
+
+        [Conditional("DEBUG")]
+        private static unsafe void AssertRead<TVector>(byte* src, byte* srcStart, int srcLength)
+        {
+            int vectorElements = Unsafe.SizeOf<TVector>();
+            byte* readEnd = src + vectorElements;
+            byte* srcEnd = srcStart + srcLength;
+
+            if (readEnd > srcEnd)
+            {
+                int srcIndex = (int)(src - srcStart);
+                Debug.Fail($"Read for {typeof(TVector)} is not within safe bounds. srcIndex: {srcIndex}, srcLength: {srcLength}");
+            }
+        }
+
+        [Conditional("DEBUG")]
+        private static unsafe void AssertWrite<TVector>(byte* dest, byte* destStart, int destLength)
+        {
+            int vectorElements = Unsafe.SizeOf<TVector>();
+            byte* writeEnd = dest + vectorElements;
+            byte* destEnd = destStart + destLength;
+
+            if (writeEnd > destEnd)
+            {
+                int destIndex = (int)(dest - destStart);
+                Debug.Fail($"Write for {typeof(TVector)} is not within safe bounds. destIndex: {destIndex}, destLength: {destLength}");
+            }
+        }
+    }
+}
diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs
index d54a8d59ba57..ffb660aae7e8 100644
--- a/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs
+++ b/src/System.Memory/src/System/Buffers/Text/Base64Decoder.cs
@@ -2,12 +2,18 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 using Internal.Runtime.CompilerServices;
 
 namespace System.Buffers.Text
 {
+    // AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2
+    // SSSE3 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3
+
     public static partial class Base64
     {
         /// <summary>
@@ -18,7 +24,7 @@ public static partial class Base64
         /// <param name="bytes">The output span which contains the result of the operation, i.e. the decoded binary data.</param>
         /// <param name="bytesConsumed">The number of input bytes consumed during the operation. This can be used to slice the input for subsequent calls, if necessary.</param>
         /// <param name="bytesWritten">The number of bytes written into the output span. This can be used to slice the output for subsequent calls, if necessary.</param>
-        /// <param name="isFinalBlock">True (default) when the input span contains the entire data to decode. 
+        /// <param name="isFinalBlock">True (default) when the input span contains the entire data to decode.
         /// Set to false only if it is known that the input span contains partial data with more data to follow.</param>
         /// <returns>It returns the OperationStatus enum values:
         /// - Done - on successful processing of the entire input span
@@ -27,144 +33,191 @@ public static partial class Base64
         /// - InvalidData - if the input contains bytes outside of the expected base 64 range, or if it contains invalid/more than two padding characters,
         ///   or if the input is incomplete (i.e. not a multiple of 4) and isFinalBlock is true.
         /// </returns>
-        public static OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Span<byte> bytes, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true)
+        public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Span<byte> bytes, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true)
         {
-            ref byte srcBytes = ref MemoryMarshal.GetReference(utf8);
-            ref byte destBytes = ref MemoryMarshal.GetReference(bytes);
-
-            int srcLength = utf8.Length & ~0x3;  // only decode input up to the closest multiple of 4.
-            int destLength = bytes.Length;
-
-            int sourceIndex = 0;
-            int destIndex = 0;
-
-            if (utf8.Length == 0)
-                goto DoneExit;
-
-            ref sbyte decodingMap = ref s_decodingMap[0];
-
-            // Last bytes could have padding characters, so process them separately and treat them as valid only if isFinalBlock is true
-            // if isFinalBlock is false, padding characters are considered invalid
-            int skipLastChunk = isFinalBlock ? 4 : 0;
-
-            int maxSrcLength = 0;
-            if (destLength >= GetMaxDecodedFromUtf8Length(srcLength))
-            {
-                maxSrcLength = srcLength - skipLastChunk;
-            }
-            else
-            {
-                // This should never overflow since destLength here is less than int.MaxValue / 4 * 3 (i.e. 1610612733)
-                // Therefore, (destLength / 3) * 4 will always be less than 2147483641
-                maxSrcLength = (destLength / 3) * 4;
-            }
-
-            while (sourceIndex < maxSrcLength)
+            if (utf8.IsEmpty)
             {
-                int result = Decode(ref Unsafe.Add(ref srcBytes, sourceIndex), ref decodingMap);
-                if (result < 0)
-                    goto InvalidExit;
-                WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, destIndex), result);
-                destIndex += 3;
-                sourceIndex += 4;
+                bytesConsumed = 0;
+                bytesWritten = 0;
+                return OperationStatus.Done;
             }
 
-            if (maxSrcLength != srcLength - skipLastChunk)
-                goto DestinationSmallExit;
-
-            // If input is less than 4 bytes, srcLength == sourceIndex == 0
-            // If input is not a multiple of 4, sourceIndex == srcLength != 0
-            if (sourceIndex == srcLength)
+            fixed (byte* srcBytes = &MemoryMarshal.GetReference(utf8))
+            fixed (byte* destBytes = &MemoryMarshal.GetReference(bytes))
             {
-                if (isFinalBlock)
-                    goto InvalidExit;
-                goto NeedMoreExit;
-            }
-
-            // if isFinalBlock is false, we will never reach this point
-
-            int i0 = Unsafe.Add(ref srcBytes, srcLength - 4);
-            int i1 = Unsafe.Add(ref srcBytes, srcLength - 3);
-            int i2 = Unsafe.Add(ref srcBytes, srcLength - 2);
-            int i3 = Unsafe.Add(ref srcBytes, srcLength - 1);
-
-            i0 = Unsafe.Add(ref decodingMap, i0);
-            i1 = Unsafe.Add(ref decodingMap, i1);
-
-            i0 <<= 18;
-            i1 <<= 12;
-
-            i0 |= i1;
-
-            if (i3 != EncodingPad)
-            {
-                i2 = Unsafe.Add(ref decodingMap, i2);
-                i3 = Unsafe.Add(ref decodingMap, i3);
-
-                i2 <<= 6;
-
-                i0 |= i3;
-                i0 |= i2;
-
-                if (i0 < 0)
-                    goto InvalidExit;
-                if (destIndex > destLength - 3)
-                    goto DestinationSmallExit;
-                WriteThreeLowOrderBytes(ref Unsafe.Add(ref destBytes, destIndex), i0);
-                destIndex += 3;
-            }
-            else if (i2 != EncodingPad)
-            {
-                i2 = Unsafe.Add(ref decodingMap, i2);
-
-                i2 <<= 6;
-
-                i0 |= i2;
-
-                if (i0 < 0)
-                    goto InvalidExit;
-                if (destIndex > destLength - 2)
-                    goto DestinationSmallExit;
-                Unsafe.Add(ref destBytes, destIndex) = (byte)(i0 >> 16);
-                Unsafe.Add(ref destBytes, destIndex + 1) = (byte)(i0 >> 8);
-                destIndex += 2;
-            }
-            else
-            {
-                if (i0 < 0)
-                    goto InvalidExit;
-                if (destIndex > destLength - 1)
-                    goto DestinationSmallExit;
-                Unsafe.Add(ref destBytes, destIndex) = (byte)(i0 >> 16);
-                destIndex += 1;
-            }
-
-            sourceIndex += 4;
-
-            if (srcLength != utf8.Length)
-                goto InvalidExit;
+                int srcLength = utf8.Length & ~0x3;  // only decode input up to the closest multiple of 4.
+                int destLength = bytes.Length;
+                int maxSrcLength = srcLength;
+                int decodedLength = GetMaxDecodedFromUtf8Length(srcLength);
+
+                // max. 2 padding chars
+                if (destLength < decodedLength - 2)
+                {
+                    // For overflow see comment below
+                    maxSrcLength = destLength / 3 * 4;
+                }
+
+                byte* src = srcBytes;
+                byte* dest = destBytes;
+                byte* srcEnd = srcBytes + (uint)srcLength;
+                byte* srcMax = srcBytes + (uint)maxSrcLength;
+
+                if (maxSrcLength >= 24)
+                {
+                    byte* end = srcMax - 45;
+                    if (Avx2.IsSupported && (end >= src))
+                    {
+                        Avx2Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
+
+                        if (src == srcEnd)
+                            goto DoneExit;
+                    }
+
+                    end = srcMax - 24;
+                    if (Ssse3.IsSupported && (end >= src))
+                    {
+                        Ssse3Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
+
+                        if (src == srcEnd)
+                            goto DoneExit;
+                    }
+                }
+
+                // Last bytes could have padding characters, so process them separately and treat them as valid only if isFinalBlock is true
+                // if isFinalBlock is false, padding characters are considered invalid
+                int skipLastChunk = isFinalBlock ? 4 : 0;
+
+                if (destLength >= decodedLength)
+                {
+                    maxSrcLength = srcLength - skipLastChunk;
+                }
+                else
+                {
+                    // This should never overflow since destLength here is less than int.MaxValue / 4 * 3 (i.e. 1610612733)
+                    // Therefore, (destLength / 3) * 4 will always be less than 2147483641
+                    Debug.Assert(destLength < (int.MaxValue / 4 * 3));
+                    maxSrcLength = (destLength / 3) * 4;
+                }
+
+                ref sbyte decodingMap = ref MemoryMarshal.GetReference(s_decodingMap);
+                srcMax = srcBytes + (uint)maxSrcLength;
+
+                while (src < srcMax)
+                {
+                    int result = Decode(src, ref decodingMap);
+
+                    if (result < 0)
+                        goto InvalidDataExit;
+
+                    WriteThreeLowOrderBytes(dest, result);
+                    src += 4;
+                    dest += 3;
+                }
+
+                if (maxSrcLength != srcLength - skipLastChunk)
+                    goto DestinationTooSmallExit;
+
+                // If input is less than 4 bytes, srcLength == sourceIndex == 0
+                // If input is not a multiple of 4, sourceIndex == srcLength != 0
+                if (src == srcEnd)
+                {
+                    if (isFinalBlock)
+                        goto InvalidDataExit;
+                    goto NeedMoreDataExit;
+                }
+
+                // if isFinalBlock is false, we will never reach this point
+
+                // Handle last four bytes. There are 0, 1, 2 padding chars.
+                uint t0 = srcEnd[-4];
+                uint t1 = srcEnd[-3];
+                uint t2 = srcEnd[-2];
+                uint t3 = srcEnd[-1];
+
+                int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0);
+                int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1);
+
+                i0 <<= 18;
+                i1 <<= 12;
+
+                i0 |= i1;
+
+                byte* destMax = destBytes + (uint)destLength;
+
+                if (t3 != EncodingPad)
+                {
+                    int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2);
+                    int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3);
+
+                    i2 <<= 6;
+
+                    i0 |= i3;
+                    i0 |= i2;
+
+                    if (i0 < 0)
+                        goto InvalidDataExit;
+                    if (dest + 3 > destMax)
+                        goto DestinationTooSmallExit;
+
+                    WriteThreeLowOrderBytes(dest, i0);
+                    dest += 3;
+                }
+                else if (t2 != EncodingPad)
+                {
+                    int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2);
+
+                    i2 <<= 6;
+
+                    i0 |= i2;
+
+                    if (i0 < 0)
+                        goto InvalidDataExit;
+                    if (dest + 2 > destMax)
+                        goto DestinationTooSmallExit;
+
+                    dest[0] = (byte)(i0 >> 16);
+                    dest[1] = (byte)(i0 >> 8);
+                    dest += 2;
+                }
+                else
+                {
+                    if (i0 < 0)
+                        goto InvalidDataExit;
+                    if (dest + 1 > destMax)
+                        goto DestinationTooSmallExit;
+
+                    dest[0] = (byte)(i0 >> 16);
+                    dest += 1;
+                }
+
+                src += 4;
+
+                if (srcLength != utf8.Length)
+                    goto InvalidDataExit;
 
             DoneExit:
-            bytesConsumed = sourceIndex;
-            bytesWritten = destIndex;
-            return OperationStatus.Done;
-
-        DestinationSmallExit:
-            if (srcLength != utf8.Length && isFinalBlock)
-                goto InvalidExit; // if input is not a multiple of 4, and there is no more data, return invalid data instead
-            bytesConsumed = sourceIndex;
-            bytesWritten = destIndex;
-            return OperationStatus.DestinationTooSmall;
-
-        NeedMoreExit:
-            bytesConsumed = sourceIndex;
-            bytesWritten = destIndex;
-            return OperationStatus.NeedMoreData;
-
-        InvalidExit:
-            bytesConsumed = sourceIndex;
-            bytesWritten = destIndex;
-            return OperationStatus.InvalidData;
+                bytesConsumed = (int)(src - srcBytes);
+                bytesWritten = (int)(dest - destBytes);
+                return OperationStatus.Done;
+
+            DestinationTooSmallExit:
+                if (srcLength != utf8.Length && isFinalBlock)
+                    goto InvalidDataExit; // if input is not a multiple of 4, and there is no more data, return invalid data instead
+
+                bytesConsumed = (int)(src - srcBytes);
+                bytesWritten = (int)(dest - destBytes);
+                return OperationStatus.DestinationTooSmall;
+
+            NeedMoreDataExit:
+                bytesConsumed = (int)(src - srcBytes);
+                bytesWritten = (int)(dest - destBytes);
+                return OperationStatus.NeedMoreData;
+
+            InvalidDataExit:
+                bytesConsumed = (int)(src - srcBytes);
+                bytesWritten = (int)(dest - destBytes);
+                return OperationStatus.InvalidData;
+            }
         }
 
         /// <summary>
@@ -191,109 +244,349 @@ public static int GetMaxDecodedFromUtf8Length(int length)
         /// <param name="bytesWritten">The number of bytes written into the buffer.</param>
         /// <returns>It returns the OperationStatus enum values:
         /// - Done - on successful processing of the entire input span
-        /// - InvalidData - if the input contains bytes outside of the expected base 64 range, or if it contains invalid/more than two padding characters, 
+        /// - InvalidData - if the input contains bytes outside of the expected base 64 range, or if it contains invalid/more than two padding characters,
         ///   or if the input is incomplete (i.e. not a multiple of 4).
         /// It does not return DestinationTooSmall since that is not possible for base 64 decoding.
-        /// It does not return NeedMoreData since this method tramples the data in the buffer and 
+        /// It does not return NeedMoreData since this method tramples the data in the buffer and
         /// hence can only be called once with all the data in the buffer.
         /// </returns>
-        public static OperationStatus DecodeFromUtf8InPlace(Span<byte> buffer, out int bytesWritten)
+        public static unsafe OperationStatus DecodeFromUtf8InPlace(Span<byte> buffer, out int bytesWritten)
         {
-            int bufferLength = buffer.Length;
-            int sourceIndex = 0;
-            int destIndex = 0;
+            if (buffer.IsEmpty)
+            {
+                bytesWritten = 0;
+                return OperationStatus.Done;
+            }
+
+            fixed (byte* bufferBytes = &MemoryMarshal.GetReference(buffer))
+            {
+                int bufferLength = buffer.Length;
+                uint sourceIndex = 0;
+                uint destIndex = 0;
 
-            // only decode input if it is a multiple of 4
-            if (bufferLength != ((bufferLength >> 2) * 4))
-                goto InvalidExit;
-            if (bufferLength == 0)
-                goto DoneExit;
+                // only decode input if it is a multiple of 4
+                if (bufferLength != ((bufferLength >> 2) * 4))
+                    goto InvalidExit;
+                if (bufferLength == 0)
+                    goto DoneExit;
 
-            ref byte bufferBytes = ref MemoryMarshal.GetReference(buffer);
+                ref sbyte decodingMap = ref MemoryMarshal.GetReference(s_decodingMap);
 
-            ref sbyte decodingMap = ref s_decodingMap[0];
+                while (sourceIndex < bufferLength - 4)
+                {
+                    int result = Decode(bufferBytes + sourceIndex, ref decodingMap);
+                    if (result < 0)
+                        goto InvalidExit;
+                    WriteThreeLowOrderBytes(bufferBytes + destIndex, result);
+                    destIndex += 3;
+                    sourceIndex += 4;
+                }
 
-            while (sourceIndex < bufferLength - 4)
-            {
-                int result = Decode(ref Unsafe.Add(ref bufferBytes, sourceIndex), ref decodingMap);
-                if (result < 0)
-                    goto InvalidExit;
-                WriteThreeLowOrderBytes(ref Unsafe.Add(ref bufferBytes, destIndex), result);
-                destIndex += 3;
-                sourceIndex += 4;
-            }
+                uint t0 = bufferBytes[bufferLength - 4];
+                uint t1 = bufferBytes[bufferLength - 3];
+                uint t2 = bufferBytes[bufferLength - 2];
+                uint t3 = bufferBytes[bufferLength - 1];
 
-            int i0 = Unsafe.Add(ref bufferBytes, bufferLength - 4);
-            int i1 = Unsafe.Add(ref bufferBytes, bufferLength - 3);
-            int i2 = Unsafe.Add(ref bufferBytes, bufferLength - 2);
-            int i3 = Unsafe.Add(ref bufferBytes, bufferLength - 1);
+                int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0);
+                int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1);
 
-            i0 = Unsafe.Add(ref decodingMap, i0);
-            i1 = Unsafe.Add(ref decodingMap, i1);
+                i0 <<= 18;
+                i1 <<= 12;
 
-            i0 <<= 18;
-            i1 <<= 12;
+                i0 |= i1;
 
-            i0 |= i1;
+                if (t3 != EncodingPad)
+                {
+                    int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2);
+                    int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3);
 
-            if (i3 != EncodingPad)
-            {
-                i2 = Unsafe.Add(ref decodingMap, i2);
-                i3 = Unsafe.Add(ref decodingMap, i3);
+                    i2 <<= 6;
 
-                i2 <<= 6;
+                    i0 |= i3;
+                    i0 |= i2;
 
-                i0 |= i3;
-                i0 |= i2;
+                    if (i0 < 0)
+                        goto InvalidExit;
 
-                if (i0 < 0)
-                    goto InvalidExit;
-                WriteThreeLowOrderBytes(ref Unsafe.Add(ref bufferBytes, destIndex), i0);
-                destIndex += 3;
-            }
-            else if (i2 != EncodingPad)
-            {
-                i2 = Unsafe.Add(ref decodingMap, i2);
+                    WriteThreeLowOrderBytes(bufferBytes + destIndex, i0);
+                    destIndex += 3;
+                }
+                else if (t2 != EncodingPad)
+                {
+                    int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2);
 
-                i2 <<= 6;
+                    i2 <<= 6;
 
-                i0 |= i2;
+                    i0 |= i2;
 
-                if (i0 < 0)
-                    goto InvalidExit;
-                Unsafe.Add(ref bufferBytes, destIndex) = (byte)(i0 >> 16);
-                Unsafe.Add(ref bufferBytes, destIndex + 1) = (byte)(i0 >> 8);
-                destIndex += 2;
+                    if (i0 < 0)
+                        goto InvalidExit;
+
+                    bufferBytes[destIndex] = (byte)(i0 >> 16);
+                    bufferBytes[destIndex + 1] = (byte)(i0 >> 8);
+                    destIndex += 2;
+                }
+                else
+                {
+                    if (i0 < 0)
+                        goto InvalidExit;
+
+                    bufferBytes[destIndex] = (byte)(i0 >> 16);
+                    destIndex += 1;
+                }
+
+            DoneExit:
+                bytesWritten = (int)destIndex;
+                return OperationStatus.Done;
+
+            InvalidExit:
+                bytesWritten = (int)destIndex;
+                return OperationStatus.InvalidData;
             }
-            else
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
+        {
+            // If we have AVX2 support, pick off 32 bytes at a time for as long as we can,
+            // but make sure that we quit before seeing any == markers at the end of the
+            // string. Also, because we write 8 zeroes at the end of the output, ensure
+            // that there are at least 11 valid bytes of input data remaining to close the
+            // gap. 32 + 2 + 11 = 45 bytes.
+
+            // See SSSE3-version below for an explanation of how the code works.
+
+            // The JIT won't hoist these "constants", so help it
+            Vector256<sbyte> lutHi = ReadVector<Vector256<sbyte>>(s_avxDecodeLutHi);
+            Vector256<sbyte> lutLo = ReadVector<Vector256<sbyte>>(s_avxDecodeLutLo);
+            Vector256<sbyte> lutShift = ReadVector<Vector256<sbyte>>(s_avxDecodeLutShift);
+            Vector256<sbyte> mask2F = Vector256.Create((sbyte)'/');
+            Vector256<sbyte> mergeConstant0 = Vector256.Create(0x01400140).AsSByte();
+            Vector256<short> mergeConstant1 = Vector256.Create(0x00011000).AsInt16();
+            Vector256<sbyte> packBytesInLaneMask = ReadVector<Vector256<sbyte>>(s_avxDecodePackBytesInLaneMask);
+            Vector256<int> packLanesControl = ReadVector<Vector256<sbyte>>(s_avxDecodePackLanesControl).AsInt32();
+
+            byte* src = srcBytes;
+            byte* dest = destBytes;
+
+            //while (remaining >= 45)
+            do
             {
-                if (i0 < 0)
-                    goto InvalidExit;
-                Unsafe.Add(ref bufferBytes, destIndex) = (byte)(i0 >> 16);
-                destIndex += 1;
+                AssertRead<Vector256<sbyte>>(src, srcStart, sourceLength);
+                Vector256<sbyte> str = Avx.LoadVector256(src).AsSByte();
+
+                Vector256<sbyte> hiNibbles = Avx2.And(Avx2.ShiftRightLogical(str.AsInt32(), 4).AsSByte(), mask2F);
+                Vector256<sbyte> loNibbles = Avx2.And(str, mask2F);
+                Vector256<sbyte> hi = Avx2.Shuffle(lutHi, hiNibbles);
+                Vector256<sbyte> lo = Avx2.Shuffle(lutLo, loNibbles);
+
+                if (!Avx.TestZ(lo, hi))
+                    break;
+
+                Vector256<sbyte> eq2F = Avx2.CompareEqual(str, mask2F);
+                Vector256<sbyte> shift = Avx2.Shuffle(lutShift, Avx2.Add(eq2F, hiNibbles));
+                str = Avx2.Add(str, shift);
+
+                // in, lower lane, bits, upper case are most significant bits, lower case are least significant bits:
+                // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+                // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+                // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+                // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+                Vector256<short> merge_ab_and_bc = Avx2.MultiplyAddAdjacent(str.AsByte(), mergeConstant0);
+                // 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+                // 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+                // 0000eeee FFffffff 0000DDDD DDddEEEE
+                // 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+                Vector256<int> output = Avx2.MultiplyAddAdjacent(merge_ab_and_bc, mergeConstant1);
+                // 00000000 JJJJJJjj KKKKkkkk LLllllll
+                // 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+                // 00000000 DDDDDDdd EEEEeeee FFffffff
+                // 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+                // Pack bytes together in each lane:
+                output = Avx2.Shuffle(output.AsSByte(), packBytesInLaneMask).AsInt32();
+                // 00000000 00000000 00000000 00000000
+                // LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+                // HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+                // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+
+                // Pack lanes
+                str = Avx2.PermuteVar8x32(output, packLanesControl).AsSByte();
+
+                AssertWrite<Vector256<sbyte>>(dest, destStart, destLength);
+                Avx.Store(dest, str.AsByte());
+
+                src += 32;
+                dest += 24;
             }
+            while (src <= srcEnd);
 
-        DoneExit:
-            bytesWritten = destIndex;
-            return OperationStatus.Done;
+            srcBytes = src;
+            destBytes = dest;
+        }
 
-        InvalidExit:
-            bytesWritten = destIndex;
-            return OperationStatus.InvalidData;
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
+        {
+            // If we have SSSE3 support, pick off 16 bytes at a time for as long as we can,
+            // but make sure that we quit before seeing any == markers at the end of the
+            // string. Also, because we write four zeroes at the end of the output, ensure
+            // that there are at least 6 valid bytes of input data remaining to close the
+            // gap. 16 + 2 + 6 = 24 bytes.
+
+            // The input consists of six character sets in the Base64 alphabet,
+            // which we need to map back to the 6-bit values they represent.
+            // There are three ranges, two singles, and then there's the rest.
+            //
+            //  #  From       To        Add  Characters
+            //  1  [43]       [62]      +19  +
+            //  2  [47]       [63]      +16  /
+            //  3  [48..57]   [52..61]   +4  0..9
+            //  4  [65..90]   [0..25]   -65  A..Z
+            //  5  [97..122]  [26..51]  -71  a..z
+            // (6) Everything else => invalid input
+
+            // We will use LUTs for character validation & offset computation.
+            // Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8;
+            // this allows masking with 0x2F instead of 0x0F and thus saves one constant declaration (register and/or memory access)
+
+            // For offsets:
+            // Perfect hash for lut = ((src>>4)&0x2F)+((src==0x2F)?0xFF:0x00)
+            // 0000 = garbage
+            // 0001 = /
+            // 0010 = +
+            // 0011 = 0-9
+            // 0100 = A-Z
+            // 0101 = A-Z
+            // 0110 = a-z
+            // 0111 = a-z
+            // 1000 and above = garbage
+
+            // For validation, here's the table.
+            // A character is valid if and only if the AND of the 2 lookups equals 0:
+
+            // hi \ lo              0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
+            //      LUT             0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
+
+            // 0000 0x10 char        NUL  SOH  STX  ETX  EOT  ENQ  ACK  BEL   BS   HT   LF   VT   FF   CR   SO   SI
+            //           andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+
+            // 0001 0x10 char        DLE  DC1  DC2  DC3  DC4  NAK  SYN  ETB  CAN   EM  SUB  ESC   FS   GS   RS   US
+            //           andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+
+            // 0010 0x01 char               !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
+            //           andlut     0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
+
+            // 0011 0x02 char          0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
+            //           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
+
+            // 0100 0x04 char          @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
+            //           andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+
+            // 0101 0x08 char          P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
+            //           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
+
+            // 0110 0x04 char          `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
+            //           andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+            // 0111 0x08 char          p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL
+            //           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
+
+            // 1000 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+            // 1001 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+            // 1010 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+            // 1011 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+            // 1100 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+            // 1101 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+            // 1110 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+            // 1111 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+
+            // The JIT won't hoist these "constants", so help it
+            Vector128<sbyte> lutHi = ReadVector<Vector128<sbyte>>(s_sseDecodeLutHi);
+            Vector128<sbyte> lutLo = ReadVector<Vector128<sbyte>>(s_sseDecodeLutLo);
+            Vector128<sbyte> lutShift = ReadVector<Vector128<sbyte>>(s_sseDecodeLutShift);
+            Vector128<sbyte> mask2F = Vector128.Create((sbyte)'/');
+            Vector128<sbyte> mergeConstant0 = Vector128.Create(0x01400140).AsSByte();
+            Vector128<short> mergeConstant1 = Vector128.Create(0x00011000).AsInt16();
+            Vector128<sbyte> packBytesMask = ReadVector<Vector128<sbyte>>(s_sseDecodePackBytesMask);
+            Vector128<sbyte> zero = Vector128<sbyte>.Zero;
+
+            byte* src = srcBytes;
+            byte* dest = destBytes;
+
+            //while (remaining >= 24)
+            do
+            {
+                AssertRead<Vector128<sbyte>>(src, srcStart, sourceLength);
+                Vector128<sbyte> str = Sse2.LoadVector128(src).AsSByte();
+
+                // lookup
+                Vector128<sbyte> hiNibbles = Sse2.And(Sse2.ShiftRightLogical(str.AsInt32(), 4).AsSByte(), mask2F);
+                Vector128<sbyte> loNibbles = Sse2.And(str, mask2F);
+                Vector128<sbyte> hi = Ssse3.Shuffle(lutHi, hiNibbles);
+                Vector128<sbyte> lo = Ssse3.Shuffle(lutLo, loNibbles);
+
+                // Check for invalid input: if any "and" values from lo and hi are not zero,
+                // fall back on bytewise code to do error checking and reporting:
+                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.And(lo, hi), zero)) != 0)
+                    break;
+
+                Vector128<sbyte> eq2F = Sse2.CompareEqual(str, mask2F);
+                Vector128<sbyte> shift = Ssse3.Shuffle(lutShift, Sse2.Add(eq2F, hiNibbles));
+
+                // Now simply add the delta values to the input:
+                str = Sse2.Add(str, shift);
+
+                // in, bits, upper case are most significant bits, lower case are least significant bits
+                // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+                // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+                // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+                // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+                Vector128<short> merge_ab_and_bc = Ssse3.MultiplyAddAdjacent(str.AsByte(), mergeConstant0);
+                // 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+                // 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+                // 0000eeee FFffffff 0000DDDD DDddEEEE
+                // 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+                Vector128<int> output = Sse2.MultiplyAddAdjacent(merge_ab_and_bc, mergeConstant1);
+                // 00000000 JJJJJJjj KKKKkkkk LLllllll
+                // 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+                // 00000000 DDDDDDdd EEEEeeee FFffffff
+                // 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+                // Pack bytes together:
+                str = Ssse3.Shuffle(output.AsSByte(), packBytesMask);
+                // 00000000 00000000 00000000 00000000
+                // LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+                // HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+                // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+
+                AssertWrite<Vector128<sbyte>>(dest, destStart, destLength);
+                Sse2.Store(dest, str.AsByte());
+
+                src += 16;
+                dest += 12;
+            }
+            while (src <= srcEnd);
+
+            srcBytes = src;
+            destBytes = dest;
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int Decode(ref byte encodedBytes, ref sbyte decodingMap)
+        private static unsafe int Decode(byte* encodedBytes, ref sbyte decodingMap)
         {
-            int i0 = encodedBytes;
-            int i1 = Unsafe.Add(ref encodedBytes, 1);
-            int i2 = Unsafe.Add(ref encodedBytes, 2);
-            int i3 = Unsafe.Add(ref encodedBytes, 3);
+            uint t0 = encodedBytes[0];
+            uint t1 = encodedBytes[1];
+            uint t2 = encodedBytes[2];
+            uint t3 = encodedBytes[3];
 
-            i0 = Unsafe.Add(ref decodingMap, i0);
-            i1 = Unsafe.Add(ref decodingMap, i1);
-            i2 = Unsafe.Add(ref decodingMap, i2);
-            i3 = Unsafe.Add(ref decodingMap, i3);
+            int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0);
+            int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1);
+            int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2);
+            int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3);
 
             i0 <<= 18;
             i1 <<= 12;
@@ -307,15 +600,15 @@ private static int Decode(ref byte encodedBytes, ref sbyte decodingMap)
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void WriteThreeLowOrderBytes(ref byte destination, int value)
+        private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value)
         {
-            destination = (byte)(value >> 16);
-            Unsafe.Add(ref destination, 1) = (byte)(value >> 8);
-            Unsafe.Add(ref destination, 2) = (byte)value;
+            destination[0] = (byte)(value >> 16);
+            destination[1] = (byte)(value >> 8);
+            destination[2] = (byte)(value);
         }
 
         // Pre-computing this table using a custom string(s_characters) and GenerateDecodingMapAndVerify (found in tests)
-        private static readonly sbyte[] s_decodingMap = {
+        private static ReadOnlySpan<sbyte> s_decodingMap => new sbyte[] {
             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,         //62 is placed at index 43 (for +), 63 at index 47 (for /)
@@ -333,5 +626,88 @@ private static void WriteThreeLowOrderBytes(ref byte destination, int value)
             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         };
+
+        private static ReadOnlySpan<sbyte> s_sseDecodePackBytesMask => new sbyte[] {
+            2, 1, 0, 6,
+            5, 4, 10, 9,
+            8, 14, 13, 12,
+            -1, -1, -1, -1
+        };
+
+        private static ReadOnlySpan<sbyte> s_sseDecodeLutLo => new sbyte[] {
+            0x15, 0x11, 0x11, 0x11,
+            0x11, 0x11, 0x11, 0x11,
+            0x11, 0x11, 0x13, 0x1A,
+            0x1B, 0x1B, 0x1B, 0x1A
+        };
+
+        private static ReadOnlySpan<sbyte> s_sseDecodeLutHi => new sbyte[] {
+            0x10, 0x10, 0x01, 0x02,
+            0x04, 0x08, 0x04, 0x08,
+            0x10, 0x10, 0x10, 0x10,
+            0x10, 0x10, 0x10, 0x10
+        };
+
+        private static ReadOnlySpan<sbyte> s_sseDecodeLutShift => new sbyte[] {
+            0, 16, 19, 4,
+            -65, -65, -71, -71,
+            0, 0, 0, 0,
+            0, 0, 0, 0
+        };
+
+        private static ReadOnlySpan<sbyte> s_avxDecodePackBytesInLaneMask => new sbyte[] {
+            2, 1, 0, 6,
+            5, 4, 10, 9,
+            8, 14, 13, 12,
+            -1, -1, -1, -1,
+            2, 1, 0, 6,
+            5, 4, 10, 9,
+            8, 14, 13, 12,
+            -1, -1, -1, -1
+        };
+
+        private static ReadOnlySpan<sbyte> s_avxDecodePackLanesControl => new sbyte[] {
+            0, 0, 0, 0,
+            1, 0, 0, 0,
+            2, 0, 0, 0,
+            4, 0, 0, 0,
+            5, 0, 0, 0,
+            6, 0, 0, 0,
+            -1, -1, -1, -1,
+            -1, -1, -1, -1
+        };
+
+        private static ReadOnlySpan<sbyte> s_avxDecodeLutLo => new sbyte[] {
+            0x15, 0x11, 0x11, 0x11,
+            0x11, 0x11, 0x11, 0x11,
+            0x11, 0x11, 0x13, 0x1A,
+            0x1B, 0x1B, 0x1B, 0x1A,
+            0x15, 0x11, 0x11, 0x11,
+            0x11, 0x11, 0x11, 0x11,
+            0x11, 0x11, 0x13, 0x1A,
+            0x1B, 0x1B, 0x1B, 0x1A
+        };
+
+        private static ReadOnlySpan<sbyte> s_avxDecodeLutHi => new sbyte[] {
+            0x10, 0x10, 0x01, 0x02,
+            0x04, 0x08, 0x04, 0x08,
+            0x10, 0x10, 0x10, 0x10,
+            0x10, 0x10, 0x10, 0x10,
+            0x10, 0x10, 0x01, 0x02,
+            0x04, 0x08, 0x04, 0x08,
+            0x10, 0x10, 0x10, 0x10,
+            0x10, 0x10, 0x10, 0x10
+        };
+
+        private static ReadOnlySpan<sbyte> s_avxDecodeLutShift => new sbyte[] {
+            0, 16, 19, 4,
+            -65, -65, -71, -71,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 16, 19, 4,
+            -65, -65, -71, -71,
+            0, 0, 0, 0,
+            0, 0, 0, 0
+        };
     }
 }
diff --git a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs
index 4bb7cabc40f7..033978c003fa 100644
--- a/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs
+++ b/src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs
@@ -4,10 +4,15 @@
 
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 using Internal.Runtime.CompilerServices;
 
 namespace System.Buffers.Text
 {
+    // AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2
+    // SSSE3 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3
+
     /// <summary>
     /// Convert between binary data and UTF-8 encoded text that is represented in base 64.
     /// </summary>
@@ -20,7 +25,7 @@ public static partial class Base64
         /// <param name="utf8">The output span which contains the result of the operation, i.e. the UTF-8 encoded text in base 64.</param>
         /// <param name="bytesConsumed">The number of input bytes consumed during the operation. This can be used to slice the input for subsequent calls, if necessary.</param>
         /// <param name="bytesWritten">The number of bytes written into the output span. This can be used to slice the output for subsequent calls, if necessary.</param>
-        /// <param name="isFinalBlock">True (default) when the input span contains the entire data to encode. 
+        /// <param name="isFinalBlock">True (default) when the input span contains the entire data to encode.
         /// Set to false only if it is known that the input span contains partial data with more data to follow.</param>
         /// <returns>It returns the OperationStatus enum values:
         /// - Done - on successful processing of the entire input span
@@ -28,72 +33,105 @@ public static partial class Base64
         /// - NeedMoreData - only if isFinalBlock is false, otherwise the output is padded if the input is not a multiple of 3
         /// It does not return InvalidData since that is not possible for base 64 encoding.
         /// </returns>
-        public static OperationStatus EncodeToUtf8(ReadOnlySpan<byte> bytes, Span<byte> utf8, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true)
+        public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan<byte> bytes, Span<byte> utf8, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true)
         {
-            ref byte srcBytes = ref MemoryMarshal.GetReference(bytes);
-            ref byte destBytes = ref MemoryMarshal.GetReference(utf8);
-
-            int srcLength = bytes.Length;
-            int destLength = utf8.Length;
-
-            int maxSrcLength = 0;
-            if (srcLength <= MaximumEncodeLength && destLength >= GetMaxEncodedToUtf8Length(srcLength))
+            if (bytes.IsEmpty)
             {
-                maxSrcLength = srcLength - 2;
+                bytesConsumed = 0;
+                bytesWritten = 0;
+                return OperationStatus.Done;
             }
-            else
+
+            fixed (byte* srcBytes = &MemoryMarshal.GetReference(bytes))
+            fixed (byte* destBytes = &MemoryMarshal.GetReference(utf8))
             {
-                maxSrcLength = (destLength >> 2) * 3 - 2;
-            }
+                int srcLength = bytes.Length;
+                int destLength = utf8.Length;
+                int maxSrcLength;
 
-            int sourceIndex = 0;
-            int destIndex = 0;
-            int result = 0;
+                if (srcLength <= MaximumEncodeLength && destLength >= GetMaxEncodedToUtf8Length(srcLength))
+                {
+                    maxSrcLength = srcLength;
+                }
+                else
+                {
+                    maxSrcLength = (destLength >> 2) * 3;
+                }
 
-            ref byte encodingMap = ref s_encodingMap[0];
+                byte* src = srcBytes;
+                byte* dest = destBytes;
+                byte* srcEnd = srcBytes + (uint)srcLength;
+                byte* srcMax = srcBytes + (uint)maxSrcLength;
 
-            while (sourceIndex < maxSrcLength)
-            {
-                result = Encode(ref Unsafe.Add(ref srcBytes, sourceIndex), ref encodingMap);
-                Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, destIndex), result);
-                destIndex += 4;
-                sourceIndex += 3;
-            }
+                if (maxSrcLength >= 16)
+                {
+                    byte* end = srcMax - 32;
+                    if (Avx2.IsSupported && (end >= src))
+                    {
+                        Avx2Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
+
+                        if (src == srcEnd)
+                            goto DoneExit;
+                    }
+
+                    end = srcMax - 16;
+                    if (Ssse3.IsSupported && (end >= src))
+                    {
+                        Ssse3Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
+
+                        if (src == srcEnd)
+                            goto DoneExit;
+                    }
+                }
 
-            if (maxSrcLength != srcLength - 2)
-                goto DestinationSmallExit;
+                ref byte encodingMap = ref MemoryMarshal.GetReference(s_encodingMap);
+                uint result = 0;
 
-            if (!isFinalBlock)
-                goto NeedMoreDataExit;
+                srcMax -= 2;
+                while (src < srcMax)
+                {
+                    result = Encode(src, ref encodingMap);
+                    Unsafe.WriteUnaligned(dest, result);
+                    src += 3;
+                    dest += 4;
+                }
 
-            if (sourceIndex == srcLength - 1)
-            {
-                result = EncodeAndPadTwo(ref Unsafe.Add(ref srcBytes, sourceIndex), ref encodingMap);
-                Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, destIndex), result);
-                destIndex += 4;
-                sourceIndex += 1;
-            }
-            else if (sourceIndex == srcLength - 2)
-            {
-                result = EncodeAndPadOne(ref Unsafe.Add(ref srcBytes, sourceIndex), ref encodingMap);
-                Unsafe.WriteUnaligned(ref Unsafe.Add(ref destBytes, destIndex), result);
-                destIndex += 4;
-                sourceIndex += 2;
-            }
+                if (srcMax + 2 != srcEnd)
+                    goto DestinationTooSmallExit;
 
-            bytesConsumed = sourceIndex;
-            bytesWritten = destIndex;
-            return OperationStatus.Done;
+                if (!isFinalBlock)
+                    goto NeedMoreData;
 
-        NeedMoreDataExit:
-            bytesConsumed = sourceIndex;
-            bytesWritten = destIndex;
-            return OperationStatus.NeedMoreData;
+                if (src + 1 == srcEnd)
+                {
+                    result = EncodeAndPadTwo(src, ref encodingMap);
+                    Unsafe.WriteUnaligned(dest, result);
+                    src += 1;
+                    dest += 4;
+                }
+                else if (src + 2 == srcEnd)
+                {
+                    result = EncodeAndPadOne(src, ref encodingMap);
+                    Unsafe.WriteUnaligned(dest, result);
+                    src += 2;
+                    dest += 4;
+                }
 
-        DestinationSmallExit:
-            bytesConsumed = sourceIndex;
-            bytesWritten = destIndex;
-            return OperationStatus.DestinationTooSmall;
+            DoneExit:
+                bytesConsumed = (int)(src - srcBytes);
+                bytesWritten = (int)(dest - destBytes);
+                return OperationStatus.Done;
+
+            DestinationTooSmallExit:
+                bytesConsumed = (int)(src - srcBytes);
+                bytesWritten = (int)(dest - destBytes);
+                return OperationStatus.DestinationTooSmall;
+
+            NeedMoreData:
+                bytesConsumed = (int)(src - srcBytes);
+                bytesWritten = (int)(dest - destBytes);
+                return OperationStatus.NeedMoreData;
+            }
         }
 
         /// <summary>
@@ -108,16 +146,16 @@ public static int GetMaxEncodedToUtf8Length(int length)
             if ((uint)length > MaximumEncodeLength)
                 ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.length);
 
-            return (((length + 2) / 3) * 4);
+            return ((length + 2) / 3) * 4;
         }
 
         /// <summary>
-        /// Encode the span of binary data (in-place) into UTF-8 encoded text represented as base 64. 
+        /// Encode the span of binary data (in-place) into UTF-8 encoded text represented as base 64.
         /// The encoded text output is larger than the binary data contained in the input (the operation inflates the data).
         /// </summary>
-        /// <param name="buffer">The input span which contains binary data that needs to be encoded. 
+        /// <param name="buffer">The input span which contains binary data that needs to be encoded.
         /// It needs to be large enough to fit the result of the operation.</param>
-        /// <param name="dataLength">The amount of binary data contained within the buffer that needs to be encoded 
+        /// <param name="dataLength">The amount of binary data contained within the buffer that needs to be encoded
         /// (and needs to be smaller than the buffer length).</param>
         /// <param name="bytesWritten">The number of bytes written into the buffer.</param>
         /// <returns>It returns the OperationStatus enum values:
@@ -126,93 +164,359 @@ public static int GetMaxEncodedToUtf8Length(int length)
         /// It does not return NeedMoreData since this method tramples the data in the buffer and hence can only be called once with all the data in the buffer.
         /// It does not return InvalidData since that is not possible for base 64 encoding.
         /// </returns>
-        public static OperationStatus EncodeToUtf8InPlace(Span<byte> buffer, int dataLength, out int bytesWritten)
+        public static unsafe OperationStatus EncodeToUtf8InPlace(Span<byte> buffer, int dataLength, out int bytesWritten)
         {
-            int encodedLength = GetMaxEncodedToUtf8Length(dataLength);
-            if (buffer.Length < encodedLength)
-                goto FalseExit;
+            if (buffer.IsEmpty)
+            {
+                bytesWritten = 0;
+                return OperationStatus.Done;
+            }
 
-            int leftover = dataLength - (dataLength / 3) * 3; // how many bytes after packs of 3
+            fixed (byte* bufferBytes = &MemoryMarshal.GetReference(buffer))
+            {
+                int encodedLength = GetMaxEncodedToUtf8Length(dataLength);
+                if (buffer.Length < encodedLength)
+                    goto FalseExit;
 
-            int destinationIndex = encodedLength - 4;
-            int sourceIndex = dataLength - leftover;
-            int result = 0;
+                int leftover = dataLength - (dataLength / 3) * 3; // how many bytes after packs of 3
 
-            ref byte encodingMap = ref s_encodingMap[0];
-            ref byte bufferBytes = ref MemoryMarshal.GetReference(buffer);
+                uint destinationIndex = (uint)(encodedLength - 4);
+                uint sourceIndex = (uint)(dataLength - leftover);
+                uint result = 0;
+                ref byte encodingMap = ref MemoryMarshal.GetReference(s_encodingMap);
 
-            // encode last pack to avoid conditional in the main loop
-            if (leftover != 0)
-            {
-                if (leftover == 1)
+                // encode last pack to avoid conditional in the main loop
+                if (leftover != 0)
                 {
-                    result = EncodeAndPadTwo(ref Unsafe.Add(ref bufferBytes, sourceIndex), ref encodingMap);
-                    Unsafe.WriteUnaligned(ref Unsafe.Add(ref bufferBytes, destinationIndex), result);
+                    if (leftover == 1)
+                    {
+                        result = EncodeAndPadTwo(bufferBytes + sourceIndex, ref encodingMap);
+                    }
+                    else
+                    {
+                        result = EncodeAndPadOne(bufferBytes + sourceIndex, ref encodingMap);
+                    }
+
+                    Unsafe.WriteUnaligned(bufferBytes + destinationIndex, result);
                     destinationIndex -= 4;
                 }
-                else
+
+                sourceIndex -= 3;
+                while ((int)sourceIndex >= 0)
                 {
-                    result = EncodeAndPadOne(ref Unsafe.Add(ref bufferBytes, sourceIndex), ref encodingMap);
-                    Unsafe.WriteUnaligned(ref Unsafe.Add(ref bufferBytes, destinationIndex), result);
+                    result = Encode(bufferBytes + sourceIndex, ref encodingMap);
+                    Unsafe.WriteUnaligned(bufferBytes + destinationIndex, result);
                     destinationIndex -= 4;
+                    sourceIndex -= 3;
                 }
+
+                bytesWritten = encodedLength;
+                return OperationStatus.Done;
+
+            FalseExit:
+                bytesWritten = 0;
+                return OperationStatus.DestinationTooSmall;
             }
+        }
 
-            sourceIndex -= 3;
-            while (sourceIndex >= 0)
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
+        {
+            // If we have AVX2 support, pick off 24 bytes at a time for as long as we can.
+            // But because we read 32 bytes at a time, ensure we have enough room to do a
+            // full 32-byte read without segfaulting.
+
+            // This is the AVX2 translation of the SSSE3 procedure.
+            // It works on input that is shifted by 4 bytes in order to
+            // be able to work efficiently in the two 128-bit lanes.
+
+            // srcBytes, bytes MSB to LSB:
+            // 0 0 0 0 x w v u t s r q p o n m
+            // l k j i h g f e d c b a 0 0 0 0
+
+            // The JIT won't hoist these "constants", so help it
+            Vector256<sbyte> shuffleVec = ReadVector<Vector256<sbyte>>(s_avxEncodeShuffleVec);
+            Vector256<sbyte> maskAC = Vector256.Create(0x0fc0fc00).AsSByte();
+            Vector256<sbyte> maskBB = Vector256.Create(0x003f03f0).AsSByte();
+            Vector256<ushort> shiftAC = Vector256.Create(0x04000040).AsUInt16();
+            Vector256<short> shiftBB = Vector256.Create(0x01000010).AsInt16();
+            Vector256<byte> const51 = Vector256.Create((byte)51);
+            Vector256<sbyte> const25 = Vector256.Create((sbyte)25);
+            Vector256<sbyte> lut = ReadVector<Vector256<sbyte>>(s_avxEncodeLut);
+
+            byte* src = srcBytes;
+            byte* dest = destBytes;
+
+            // The first load is done at src-0 (not src-4) so as not to read before the start of the input (segfault).
+            AssertRead<Vector256<sbyte>>(src, srcStart, sourceLength);
+            Vector256<sbyte> str = Avx.LoadVector256(src).AsSByte();
+
+            // shift by 4 bytes, as required by Reshuffle
+            str = Avx2.PermuteVar8x32(str.AsInt32(), ReadVector<Vector256<sbyte>>(s_avxEncodePermuteVec).AsInt32()).AsSByte();
+
+            // Next loads are done at src-4, as required by Reshuffle, so shift it once
+            src -= 4;
+
+            while (true)
             {
-                result = Encode(ref Unsafe.Add(ref bufferBytes, sourceIndex), ref encodingMap);
-                Unsafe.WriteUnaligned(ref Unsafe.Add(ref bufferBytes, destinationIndex), result);
-                destinationIndex -= 4;
-                sourceIndex -= 3;
+                // Reshuffle
+                str = Avx2.Shuffle(str, shuffleVec);
+                // str, bytes MSB to LSB:
+                // w x v w
+                // t u s t
+                // q r p q
+                // n o m n
+                // k l j k
+                // h i g h
+                // e f d e
+                // b c a b
+
+                Vector256<sbyte> t0 = Avx2.And(str, maskAC);
+                // bits, upper case are most significant bits, lower case are least significant bits.
+                // 0000wwww XX000000 VVVVVV00 00000000
+                // 0000tttt UU000000 SSSSSS00 00000000
+                // 0000qqqq RR000000 PPPPPP00 00000000
+                // 0000nnnn OO000000 MMMMMM00 00000000
+                // 0000kkkk LL000000 JJJJJJ00 00000000
+                // 0000hhhh II000000 GGGGGG00 00000000
+                // 0000eeee FF000000 DDDDDD00 00000000
+                // 0000bbbb CC000000 AAAAAA00 00000000
+
+                Vector256<sbyte> t2 = Avx2.And(str, maskBB);
+                // 00000000 00xxxxxx 000000vv WWWW0000
+                // 00000000 00uuuuuu 000000ss TTTT0000
+                // 00000000 00rrrrrr 000000pp QQQQ0000
+                // 00000000 00oooooo 000000mm NNNN0000
+                // 00000000 00llllll 000000jj KKKK0000
+                // 00000000 00iiiiii 000000gg HHHH0000
+                // 00000000 00ffffff 000000dd EEEE0000
+                // 00000000 00cccccc 000000aa BBBB0000
+
+                Vector256<ushort> t1 = Avx2.MultiplyHigh(t0.AsUInt16(), shiftAC);
+                // 00000000 00wwwwXX 00000000 00VVVVVV
+                // 00000000 00ttttUU 00000000 00SSSSSS
+                // 00000000 00qqqqRR 00000000 00PPPPPP
+                // 00000000 00nnnnOO 00000000 00MMMMMM
+                // 00000000 00kkkkLL 00000000 00JJJJJJ
+                // 00000000 00hhhhII 00000000 00GGGGGG
+                // 00000000 00eeeeFF 00000000 00DDDDDD
+                // 00000000 00bbbbCC 00000000 00AAAAAA
+
+                Vector256<short> t3 = Avx2.MultiplyLow(t2.AsInt16(), shiftBB);
+                // 00xxxxxx 00000000 00vvWWWW 00000000
+                // 00uuuuuu 00000000 00ssTTTT 00000000
+                // 00rrrrrr 00000000 00ppQQQQ 00000000
+                // 00oooooo 00000000 00mmNNNN 00000000
+                // 00llllll 00000000 00jjKKKK 00000000
+                // 00iiiiii 00000000 00ggHHHH 00000000
+                // 00ffffff 00000000 00ddEEEE 00000000
+                // 00cccccc 00000000 00aaBBBB 00000000
+
+                str = Avx2.Or(t1.AsSByte(), t3.AsSByte());
+                // 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
+                // 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
+                // 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
+                // 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
+                // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+                // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+                // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+                // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+                // Translation
+                // LUT contains Absolute offset for all ranges:
+                // Translate values 0..63 to the Base64 alphabet. There are five sets:
+                // #  From      To         Abs    Index  Characters
+                // 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
+                // 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
+                // 2  [52..61]  [48..57]    -4  [2..11]  0123456789
+                // 3  [62]      [43]       -19       12  +
+                // 4  [63]      [47]       -16       13  /
+
+                // Create LUT indices from input:
+                // the index for range #0 is right, others are 1 less than expected:
+                Vector256<byte> indices = Avx2.SubtractSaturate(str.AsByte(), const51);
+
+                // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+                Vector256<sbyte> mask = Avx2.CompareGreaterThan(str, const25);
+
+                // subtract -1, i.e. add 1 to the indices for ranges #[1..4]; all indices are now correct:
+                Vector256<sbyte> tmp = Avx2.Subtract(indices.AsSByte(), mask);
+
+                // Add offsets to input values:
+                str = Avx2.Add(str, Avx2.Shuffle(lut, tmp));
+
+                AssertWrite<Vector256<sbyte>>(dest, destStart, destLength);
+                Avx.Store(dest, str.AsByte());
+
+                src += 24;
+                dest += 32;
+
+                if (src > srcEnd)
+                    break;
+
+                // Load at src-4, as required by Reshuffle (already shifted by -4)
+                AssertRead<Vector256<sbyte>>(src, srcStart, sourceLength);
+                str = Avx.LoadVector256(src).AsSByte();
             }
 
-            bytesWritten = encodedLength;
-            return OperationStatus.Done;
+            srcBytes = src + 4;
+            destBytes = dest;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
+        {
+            // If we have SSSE3 support, pick off 12 bytes at a time for as long as we can.
+            // But because we read 16 bytes at a time, ensure we have enough room to do a
+            // full 16-byte read without segfaulting.
+
+            // srcBytes, bytes MSB to LSB:
+            // 0 0 0 0 l k j i h g f e d c b a
+
+            // The JIT won't hoist these "constants", so help it
+            Vector128<sbyte> shuffleVec = ReadVector<Vector128<sbyte>>(s_sseEncodeShuffleVec);
+            Vector128<sbyte> maskAC = Vector128.Create(0x0fc0fc00).AsSByte();
+            Vector128<sbyte> maskBB = Vector128.Create(0x003f03f0).AsSByte();
+            Vector128<ushort> shiftAC = Vector128.Create(0x04000040).AsUInt16();
+            Vector128<short> shiftBB = Vector128.Create(0x01000010).AsInt16();
+            Vector128<byte> const51 = Vector128.Create((byte)51);
+            Vector128<sbyte> const25 = Vector128.Create((sbyte)25);
+            Vector128<sbyte> lut = ReadVector<Vector128<sbyte>>(s_sseEncodeLut);
+
+            byte* src = srcBytes;
+            byte* dest = destBytes;
+
+            //while (remaining >= 16)
+            do
+            {
+                AssertRead<Vector128<sbyte>>(src, srcStart, sourceLength);
+                Vector128<sbyte> str = Sse2.LoadVector128(src).AsSByte();
+
+                // Reshuffle
+                str = Ssse3.Shuffle(str, shuffleVec);
+                // str, bytes MSB to LSB:
+                // k l j k
+                // h i g h
+                // e f d e
+                // b c a b
+
+                Vector128<sbyte> t0 = Sse2.And(str, maskAC);
+                // bits, upper case are most significant bits, lower case are least significant bits
+                // 0000kkkk LL000000 JJJJJJ00 00000000
+                // 0000hhhh II000000 GGGGGG00 00000000
+                // 0000eeee FF000000 DDDDDD00 00000000
+                // 0000bbbb CC000000 AAAAAA00 00000000
+
+                Vector128<sbyte> t2 = Sse2.And(str, maskBB);
+                // 00000000 00llllll 000000jj KKKK0000
+                // 00000000 00iiiiii 000000gg HHHH0000
+                // 00000000 00ffffff 000000dd EEEE0000
+                // 00000000 00cccccc 000000aa BBBB0000
+
+                Vector128<ushort> t1 = Sse2.MultiplyHigh(t0.AsUInt16(), shiftAC);
+                // 00000000 00kkkkLL 00000000 00JJJJJJ
+                // 00000000 00hhhhII 00000000 00GGGGGG
+                // 00000000 00eeeeFF 00000000 00DDDDDD
+                // 00000000 00bbbbCC 00000000 00AAAAAA
+
+                Vector128<short> t3 = Sse2.MultiplyLow(t2.AsInt16(), shiftBB);
+                // 00llllll 00000000 00jjKKKK 00000000
+                // 00iiiiii 00000000 00ggHHHH 00000000
+                // 00ffffff 00000000 00ddEEEE 00000000
+                // 00cccccc 00000000 00aaBBBB 00000000
+
+                str = Sse2.Or(t1.AsSByte(), t3.AsSByte());
+                // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+                // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+                // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+                // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+                // Translation
+                // LUT contains Absolute offset for all ranges:
+                // Translate values 0..63 to the Base64 alphabet. There are five sets:
+                // #  From      To         Abs    Index  Characters
+                // 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
+                // 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
+                // 2  [52..61]  [48..57]    -4  [2..11]  0123456789
+                // 3  [62]      [43]       -19       12  +
+                // 4  [63]      [47]       -16       13  /
+
+                // Create LUT indices from input:
+                // the index for range #0 is right, others are 1 less than expected:
+                Vector128<byte> indices = Sse2.SubtractSaturate(str.AsByte(), const51);
+
+                // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+                Vector128<sbyte> mask = Sse2.CompareGreaterThan(str, const25);
+
+                // subtract -1, i.e. add 1 to the indices for ranges #[1..4]; all indices are now correct:
+                Vector128<sbyte> tmp = Sse2.Subtract(indices.AsSByte(), mask);
+
+                // Add offsets to input values:
+                str = Sse2.Add(str, Ssse3.Shuffle(lut, tmp));
+
+                AssertWrite<Vector128<sbyte>>(dest, destStart, destLength);
+                Sse2.Store(dest, str.AsByte());
+
+                src += 12;
+                dest += 16;
+            }
+            while (src <= srcEnd);
 
-        FalseExit:
-            bytesWritten = 0;
-            return OperationStatus.DestinationTooSmall;
+            srcBytes = src;
+            destBytes = dest;
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int Encode(ref byte threeBytes, ref byte encodingMap)
+        private static unsafe uint Encode(byte* threeBytes, ref byte encodingMap)
         {
-            int i = (threeBytes << 16) | (Unsafe.Add(ref threeBytes, 1) << 8) | Unsafe.Add(ref threeBytes, 2);
+            uint t0 = threeBytes[0];
+            uint t1 = threeBytes[1];
+            uint t2 = threeBytes[2];
 
-            int i0 = Unsafe.Add(ref encodingMap, i >> 18);
-            int i1 = Unsafe.Add(ref encodingMap, (i >> 12) & 0x3F);
-            int i2 = Unsafe.Add(ref encodingMap, (i >> 6) & 0x3F);
-            int i3 = Unsafe.Add(ref encodingMap, i & 0x3F);
+            uint i = (t0 << 16) | (t1 << 8) | t2;
+
+            uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18));
+            uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F));
+            uint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F));
+            uint i3 = Unsafe.Add(ref encodingMap, (IntPtr)(i & 0x3F));
 
             return i0 | (i1 << 8) | (i2 << 16) | (i3 << 24);
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int EncodeAndPadOne(ref byte twoBytes, ref byte encodingMap)
+        private static unsafe uint EncodeAndPadOne(byte* twoBytes, ref byte encodingMap)
         {
-            int i = (twoBytes << 16) | (Unsafe.Add(ref twoBytes, 1) << 8);
+            uint t0 = twoBytes[0];
+            uint t1 = twoBytes[1];
+
+            uint i = (t0 << 16) | (t1 << 8);
 
-            int i0 = Unsafe.Add(ref encodingMap, i >> 18);
-            int i1 = Unsafe.Add(ref encodingMap, (i >> 12) & 0x3F);
-            int i2 = Unsafe.Add(ref encodingMap, (i >> 6) & 0x3F);
+            uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18));
+            uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F));
+            uint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F));
 
             return i0 | (i1 << 8) | (i2 << 16) | (EncodingPad << 24);
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int EncodeAndPadTwo(ref byte oneByte, ref byte encodingMap)
+        private static unsafe uint EncodeAndPadTwo(byte* oneByte, ref byte encodingMap)
         {
-            int i = (oneByte << 8);
+            uint t0 = oneByte[0];
 
-            int i0 = Unsafe.Add(ref encodingMap, i >> 10);
-            int i1 = Unsafe.Add(ref encodingMap, (i >> 4) & 0x3F);
+            uint i = t0 << 8;
+
+            uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 10));
+            uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 4) & 0x3F));
 
             return i0 | (i1 << 8) | (EncodingPad << 16) | (EncodingPad << 24);
         }
 
+        private const uint EncodingPad = '='; // '=', for padding
+
+        private const int MaximumEncodeLength = (int.MaxValue / 4) * 3; // 1610612733
+
         // Pre-computing this table using a custom string(s_characters) and GenerateEncodingMapAndVerify (found in tests)
-        private static readonly byte[] s_encodingMap = {
+        private static ReadOnlySpan<byte> s_encodingMap => new byte[] {
             65, 66, 67, 68, 69, 70, 71, 72,         //A..H
             73, 74, 75, 76, 77, 78, 79, 80,         //I..P
             81, 82, 83, 84, 85, 86, 87, 88,         //Q..X
@@ -223,8 +527,51 @@ private static int EncodeAndPadTwo(ref byte oneByte, ref byte encodingMap)
             52, 53, 54, 55, 56, 57, 43, 47          //4..9, +, /
         };
 
-        private const byte EncodingPad = (byte)'='; // '=', for padding
+        private static ReadOnlySpan<sbyte> s_sseEncodeShuffleVec => new sbyte[] { // byte-shuffle control used by Ssse3Encode: every 3 input bytes (a, b, c) become the 4 bytes (b, a, c, b)
+            1, 0, 2, 1,         // input bytes 0..2  -> output bytes 0..3
+            4, 3, 5, 4,         // input bytes 3..5  -> output bytes 4..7
+            7, 6, 8, 7,         // input bytes 6..8  -> output bytes 8..11
+            10, 9, 11, 10       // input bytes 9..11 -> output bytes 12..15
+        };
 
-        private const int MaximumEncodeLength = (int.MaxValue / 4) * 3; // 1610612733
+        private static ReadOnlySpan<sbyte> s_sseEncodeLut => new sbyte[] { // offsets added to a 6-bit value to obtain its Base64 character (see the range table in Ssse3Encode)
+            65, 71, -4, -4,     // index 0: 'A'..'Z' (+65), index 1: 'a'..'z' (+71), indices 2..11: '0'..'9' (-4)
+            -4, -4, -4, -4,
+            -4, -4, -4, -4,
+            -19, -16, 0, 0      // index 12: '+' (-19), index 13: '/' (-16); no range maps to indices 14..15
+        };
+
+        private static ReadOnlySpan<sbyte> s_avxEncodePermuteVec => new sbyte[] { // eight int32 element indices (stored as little-endian bytes) passed to Avx2.PermuteVar8x32 in Avx2Encode
+            0, 0, 0, 0,         // element 0 <- element 0
+            0, 0, 0, 0,         // element 1 <- element 0
+            1, 0, 0, 0,         // element 2 <- element 1
+            2, 0, 0, 0,         // element 3 <- element 2
+            3, 0, 0, 0,         // element 4 <- element 3
+            4, 0, 0, 0,         // element 5 <- element 4
+            5, 0, 0, 0,         // element 6 <- element 5
+            6, 0, 0, 0          // element 7 <- element 6, i.e. the loaded data is shifted by 4 bytes
+        };
+
+        private static ReadOnlySpan<sbyte> s_avxEncodeShuffleVec => new sbyte[] { // byte-shuffle control used by Avx2Encode; 16 entries per 128-bit lane
+            5, 4, 6, 5,         // first 16 entries: source indices 4..15 (this half skips 4 leading bytes of the shifted input)
+            8, 7, 9, 8,
+            11, 10, 12, 11,
+            14, 13, 15, 14,
+            1, 0, 2, 1,         // last 16 entries: source indices 0..11, the same pattern as s_sseEncodeShuffleVec
+            4, 3, 5, 4,
+            7, 6, 8, 7,
+            10, 9, 11, 10
+        };
+
+        private static ReadOnlySpan<sbyte> s_avxEncodeLut => new sbyte[] { // offsets added to a 6-bit value to obtain its Base64 character; the 16-entry table is repeated once per 128-bit lane
+            65, 71, -4, -4,     // index 0: 'A'..'Z' (+65), index 1: 'a'..'z' (+71), indices 2..11: '0'..'9' (-4)
+            -4, -4, -4, -4,
+            -4, -4, -4, -4,
+            -19, -16, 0, 0,     // index 12: '+' (-19), index 13: '/' (-16); no range maps to indices 14..15
+            65, 71, -4, -4,     // second copy, same values as the first 16 entries
+            -4, -4, -4, -4,
+            -4, -4, -4, -4,
+            -19, -16, 0, 0
+        };
     }
 }
diff --git a/src/System.Memory/tests/Base64/Base64DecoderUnitTests.cs b/src/System.Memory/tests/Base64/Base64DecoderUnitTests.cs
index 3c159ddee4cf..8d6de7a7a0a6 100644
--- a/src/System.Memory/tests/Base64/Base64DecoderUnitTests.cs
+++ b/src/System.Memory/tests/Base64/Base64DecoderUnitTests.cs
@@ -45,6 +45,20 @@ public void DecodeEmptySpan()
             Assert.True(Base64TestHelper.VerifyDecodingCorrectness(source.Length, decodedBytes.Length, source, decodedBytes));
         }
 
+        [Fact]
+        public void DecodeGuid() // round-trip: Base64-encode the 16 bytes of a Guid, decode them again and verify the result
+        {
+            Span<byte> source = new byte[24]; // sized for the Base64 text of 16 bytes: ((16 + 2) / 3) * 4 = 24
+            Span<byte> decodedBytes = Guid.NewGuid().ToByteArray();
+            Base64.EncodeToUtf8(decodedBytes, source, out int _, out int _);
+
+            Assert.Equal(OperationStatus.Done,
+                Base64.DecodeFromUtf8(source, decodedBytes, out int consumed, out int decodedByteCount)); // decodes back into the span that held the Guid bytes
+            Assert.Equal(24, consumed);
+            Assert.Equal(16, decodedByteCount);
+            Assert.True(Base64TestHelper.VerifyDecodingCorrectness(source.Length, decodedBytes.Length, source, decodedBytes));
+        }
+
         [Fact]
         public void BasicDecodingWithFinalBlockFalse()
         {