From 56059b65a555c8cc41d0a3653d78d4f6137e089f Mon Sep 17 00:00:00 2001
From: Miha Zupan <mihazupan.zupan1@gmail.com>
Date: Wed, 30 Nov 2022 05:43:36 +0100
Subject: [PATCH] Add AVX2 support to IndexOfAnyValues

---
 .../tests/Span/IndexOfAnyValues.cs            |   2 +-
 .../IndexOfAnyAsciiSearcher.cs                | 668 +++++++++++++++---
 2 files changed, 577 insertions(+), 93 deletions(-)
diff --git a/src/libraries/System.Memory/tests/Span/IndexOfAnyValues.cs b/src/libraries/System.Memory/tests/Span/IndexOfAnyValues.cs
index 8a4266d76cb36..511330b0d51c9 100644
--- a/src/libraries/System.Memory/tests/Span/IndexOfAnyValues.cs
+++ b/src/libraries/System.Memory/tests/Span/IndexOfAnyValues.cs
@@ -360,7 +360,7 @@ static int LastIndexOfAnyExceptReferenceImpl(ReadOnlySpan<char> searchSpace, Rea
         private static class IndexOfAnyValuesTestHelper
         {
             private const int MaxNeedleLength = 10;
-            private const int MaxHaystackLength = 40;
+            private const int MaxHaystackLength = 100;
 
             private static readonly char[] s_randomAsciiChars;
             private static readonly char[] s_randomLatin1Chars;
diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs
index dccbddb6676ff..85ef35e7046a0 100644
--- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs
@@ -1,6 +1,7 @@
 ﻿// Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Buffers.Binary;
 using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
@@ -171,28 +172,83 @@ internal static int IndexOfAnyVectorized<TNegator, TOptimizations>(ref short sea
 
             if (searchSpaceLength > 2 * Vector128<short>.Count)
             {
-                // Process the input in chunks of 16 characters (2 * Vector128<short>).
-                // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128<byte>.
-                // As packing two Vector128<short>s into a Vector128<byte> is cheap compared to the lookup, we can effectively double the throughput.
-                // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                // Let the fallback below handle it instead. This is why the condition is
-                // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
-                ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector128<short>.Count));
-
-                do
+                if (Avx2.IsSupported)
                 {
-                    Vector128<short> source0 = Vector128.LoadUnsafe(ref currentSearchSpace);
-                    Vector128<short> source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128<short>.Count);
+                    Vector256<byte> bitmap256 = Vector256.Create(bitmap, bitmap);
 
-                    Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap);
-                    if (result != Vector128<byte>.Zero)
+                    if (searchSpaceLength > 2 * Vector256<short>.Count)
                     {
-                        return ComputeFirstIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        // Process the input in chunks of 32 characters (2 * Vector256<short>).
+                        // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
+                        // As packing two Vector256<short>s into a Vector256<byte> is cheap compared to the lookup, we can effectively double the throughput.
+                        // If the input length is a multiple of 32, don't consume the last 32 characters in this loop.
+                        // Let the fallback below handle it instead. This is why the condition is
+                        // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+                        ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector256<short>.Count));
+
+                        do
+                        {
+                            Vector256<short> source0 = Vector256.LoadUnsafe(ref currentSearchSpace);
+                            Vector256<short> source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256<short>.Count);
+
+                            Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap256);
+                            if (result != Vector256<byte>.Zero)
+                            {
+                                return ComputeFirstIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                            }
+
+                            currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector256<short>.Count);
+                        }
+                        while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
                     }
 
-                    currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector128<short>.Count);
+                    // We have 1-32 characters remaining. Process the first and last vector in the search space.
+                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
+                    Debug.Assert(searchSpaceLength >= Vector256<short>.Count, "We expect that the input is long enough for us to load a whole vector.");
+                    {
+                        ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256<short>.Count);
+
+                        ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)
+                            ? ref oneVectorAwayFromEnd
+                            : ref currentSearchSpace;
+
+                        Vector256<short> source0 = Vector256.LoadUnsafe(ref firstVector);
+                        Vector256<short> source1 = Vector256.LoadUnsafe(ref oneVectorAwayFromEnd);
+
+                        Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap256);
+                        if (result != Vector256<byte>.Zero)
+                        {
+                            return ComputeFirstIndexOverlapped<short, TNegator>(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result);
+                        }
+                    }
+
+                    return -1;
+                }
+                else
+                {
+                    // Process the input in chunks of 16 characters (2 * Vector128<short>).
+                    // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128<byte>.
+                    // As packing two Vector128<short>s into a Vector128<byte> is cheap compared to the lookup, we can effectively double the throughput.
+                    // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
+                    // Let the fallback below handle it instead. This is why the condition is
+                    // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+                    ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector128<short>.Count));
+
+                    do
+                    {
+                        Vector128<short> source0 = Vector128.LoadUnsafe(ref currentSearchSpace);
+                        Vector128<short> source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128<short>.Count);
+
+                        Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap);
+                        if (result != Vector128<byte>.Zero)
+                        {
+                            return ComputeFirstIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        }
+
+                        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector128<short>.Count);
+                    }
+                    while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
                 }
-                while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
             }
 
             // We have 1-16 characters remaining. Process the first and last vector in the search space.
@@ -226,28 +282,83 @@ internal static int LastIndexOfAnyVectorized<TNegator, TOptimizations>(ref short
 
             if (searchSpaceLength > 2 * Vector128<short>.Count)
             {
-                // Process the input in chunks of 16 characters (2 * Vector128<short>).
-                // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128<byte>.
-                // As packing two Vector128<short>s into a Vector128<byte> is cheap compared to the lookup, we can effectively double the throughput.
-                // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                // Let the fallback below handle it instead. This is why the condition is
-                // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
-                ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector128<short>.Count);
-
-                do
+                if (Avx2.IsSupported)
                 {
-                    currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector128<short>.Count);
+                    Vector256<byte> bitmap256 = Vector256.Create(bitmap, bitmap);
+
+                    if (searchSpaceLength > 2 * Vector256<short>.Count)
+                    {
+                        // Process the input in chunks of 32 characters (2 * Vector256<short>).
+                        // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
+                        // As packing two Vector256<short>s into a Vector256<byte> is cheap compared to the lookup, we can effectively double the throughput.
+                        // If the input length is a multiple of 32, don't consume the last 32 characters in this loop.
+                        // Let the fallback below handle it instead. This is why the condition is
+                        // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
+                        ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector256<short>.Count);
+
+                        do
+                        {
+                            currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector256<short>.Count);
+
+                            Vector256<short> source0 = Vector256.LoadUnsafe(ref currentSearchSpace);
+                            Vector256<short> source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256<short>.Count);
+
+                            Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap256);
+                            if (result != Vector256<byte>.Zero)
+                            {
+                                return ComputeLastIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                            }
+                        }
+                        while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart));
+                    }
+
+                    // We have 1-32 characters remaining. Process the first and last vector in the search space.
+                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
+                    Debug.Assert(searchSpaceLength >= Vector256<short>.Count, "We expect that the input is long enough for us to load a whole vector.");
+                    {
+                        ref short oneVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256<short>.Count);
 
-                    Vector128<short> source0 = Vector128.LoadUnsafe(ref currentSearchSpace);
-                    Vector128<short> source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128<short>.Count);
+                        ref short secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAfterStart)
+                            ? ref Unsafe.Subtract(ref currentSearchSpace, Vector256<short>.Count)
+                            : ref searchSpace;
 
-                    Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap);
-                    if (result != Vector128<byte>.Zero)
+                        Vector256<short> source0 = Vector256.LoadUnsafe(ref searchSpace);
+                        Vector256<short> source1 = Vector256.LoadUnsafe(ref secondVector);
+
+                        Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap256);
+                        if (result != Vector256<byte>.Zero)
+                        {
+                            return ComputeLastIndexOverlapped<short, TNegator>(ref searchSpace, ref secondVector, result);
+                        }
+                    }
+
+                    return -1;
+                }
+                else
+                {
+                    // Process the input in chunks of 16 characters (2 * Vector128<short>).
+                    // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128<byte>.
+                    // As packing two Vector128<short>s into a Vector128<byte> is cheap compared to the lookup, we can effectively double the throughput.
+                    // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
+                    // Let the fallback below handle it instead. This is why the condition is
+                    // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
+                    ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector128<short>.Count);
+
+                    do
                     {
-                        return ComputeLastIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector128<short>.Count);
+
+                        Vector128<short> source0 = Vector128.LoadUnsafe(ref currentSearchSpace);
+                        Vector128<short> source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128<short>.Count);
+
+                        Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, bitmap);
+                        if (result != Vector128<byte>.Zero)
+                        {
+                            return ComputeLastIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        }
                     }
+                    while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart));
                 }
-                while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart));
             }
 
             // We have 1-16 characters remaining. Process the first and last vector in the search space.
@@ -281,32 +392,85 @@ internal static int IndexOfAnyVectorized<TNegator, TOptimizations>(ref byte sear
 
             if (searchSpaceLength > Vector128<byte>.Count)
             {
-                // Process the input in chunks of 16 bytes.
-                // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                // Let the fallback below handle it instead. This is why the condition is
-                // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
-                ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
-
-                do
+                if (Avx2.IsSupported)
                 {
-                    Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+                    Vector256<byte> bitmap256 = Vector256.Create(bitmap, bitmap);
+
+                    if (searchSpaceLength > Vector256<byte>.Count)
+                    {
+                        // Process the input in chunks of 32 bytes.
+                        // If the input length is a multiple of 32, don't consume the last 32 bytes in this loop.
+                        // Let the fallback below handle it instead. This is why the condition is
+                        // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+                        ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256<byte>.Count);
+
+                        do
+                        {
+                            Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
+
+                            Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source, bitmap256);
+                            if (result != Vector256<byte>.Zero)
+                            {
+                                return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                            }
+
+                            currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256<byte>.Count);
+                        }
+                        while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
+                    }
 
-                    Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source, bitmap);
-                    if (result != Vector128<byte>.Zero)
+                    // We have 1-32 bytes remaining. Process the first and last half vectors in the search space.
+                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
+                    Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
                     {
-                        return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
+
+                        ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd)
+                            ? ref halfVectorAwayFromEnd
+                            : ref currentSearchSpace;
+
+                        Vector128<byte> source0 = Vector128.LoadUnsafe(ref firstVector);
+                        Vector128<byte> source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd);
+                        Vector256<byte> source = Vector256.Create(source0, source1);
+
+                        Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source, bitmap256);
+                        if (result != Vector256<byte>.Zero)
+                        {
+                            return ComputeFirstIndexOverlapped<byte, TNegator>(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result);
+                        }
                     }
 
-                    currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128<byte>.Count);
+                    return -1;
+                }
+                else
+                {
+                    // Process the input in chunks of 16 bytes.
+                    // If the input length is a multiple of 16, don't consume the last 16 bytes in this loop.
+                    // Let the fallback below handle it instead. This is why the condition is
+                    // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+                    ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
+
+                    do
+                    {
+                        Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+
+                        Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source, bitmap);
+                        if (result != Vector128<byte>.Zero)
+                        {
+                            return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        }
+
+                        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128<byte>.Count);
+                    }
+                    while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
                 }
-                while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
             }
 
             // We have 1-16 bytes remaining. Process the first and last half vectors in the search space.
             // They may overlap, but we'll handle that in the index calculation if we do get a match.
             Debug.Assert(searchSpaceLength >= sizeof(ulong), "We expect that the input is long enough for us to load a ulong.");
             {
-                ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count / 2);
+                ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - sizeof(ulong));
 
                 ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd)
                     ? ref halfVectorAwayFromEnd
@@ -334,35 +498,88 @@ internal static int LastIndexOfAnyVectorized<TNegator, TOptimizations>(ref byte
 
             if (searchSpaceLength > Vector128<byte>.Count)
             {
-                // Process the input in chunks of 16 bytes.
-                // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                // Let the fallback below handle it instead. This is why the condition is
-                // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
-                ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
-
-                do
+                if (Avx2.IsSupported)
                 {
-                    currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count);
+                    Vector256<byte> bitmap256 = Vector256.Create(bitmap, bitmap);
 
-                    Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+                    if (searchSpaceLength > Vector256<byte>.Count)
+                    {
+                        // Process the input in chunks of 32 bytes.
+                        // If the input length is a multiple of 32, don't consume the last 32 bytes in this loop.
+                        // Let the fallback below handle it instead. This is why the condition is
+                        // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
+                        ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256<byte>.Count);
+
+                        do
+                        {
+                            currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256<byte>.Count);
+
+                            Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
+
+                            Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source, bitmap256);
+                            if (result != Vector256<byte>.Zero)
+                            {
+                                return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                            }
+                        }
+                        while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
+                    }
 
-                    Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source, bitmap);
-                    if (result != Vector128<byte>.Zero)
+                    // We have 1-32 bytes remaining. Process the first and last half vectors in the search space.
+                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
+                    Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
                     {
-                        return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
+
+                        ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart)
+                            ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count)
+                            : ref searchSpace;
+
+                        Vector128<byte> source0 = Vector128.LoadUnsafe(ref searchSpace);
+                        Vector128<byte> source1 = Vector128.LoadUnsafe(ref secondVector);
+                        Vector256<byte> source = Vector256.Create(source0, source1);
+
+                        Vector256<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source, bitmap256);
+                        if (result != Vector256<byte>.Zero)
+                        {
+                            return ComputeLastIndexOverlapped<byte, TNegator>(ref searchSpace, ref secondVector, result);
+                        }
                     }
+
+                    return -1;
+                }
+                else
+                {
+                    // Process the input in chunks of 16 bytes.
+                    // If the input length is a multiple of 16, don't consume the last 16 bytes in this loop.
+                    // Let the fallback below handle it instead. This is why the condition is
+                    // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
+                    ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
+
+                    do
+                    {
+                        currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count);
+
+                        Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+
+                        Vector128<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source, bitmap);
+                        if (result != Vector128<byte>.Zero)
+                        {
+                            return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        }
+                    }
+                    while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
                 }
-                while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
             }
 
             // We have 1-16 bytes remaining. Process the first and last half vectors in the search space.
             // They may overlap, but we'll handle that in the index calculation if we do get a match.
             Debug.Assert(searchSpaceLength >= sizeof(ulong), "We expect that the input is long enough for us to load a ulong.");
             {
-                ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count / 2);
+                ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, sizeof(ulong));
 
                 ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart)
-                    ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128<short>.Count)
+                    ? ref Unsafe.Subtract(ref currentSearchSpace, sizeof(ulong))
                     : ref searchSpace;
 
                 ulong source0 = Unsafe.ReadUnaligned<ulong>(ref searchSpace);
@@ -386,32 +603,86 @@ internal static int IndexOfAnyVectorized<TNegator>(ref byte searchSpace, int sea
 
             if (searchSpaceLength > Vector128<byte>.Count)
             {
-                // Process the input in chunks of 16 bytes.
-                // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                // Let the fallback below handle it instead. This is why the condition is
-                // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
-                ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
-
-                do
+                if (Avx2.IsSupported)
                 {
-                    Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+                    Vector256<byte> bitmap256_0 = Vector256.Create(bitmap0, bitmap0);
+                    Vector256<byte> bitmap256_1 = Vector256.Create(bitmap1, bitmap1);
+
+                    if (searchSpaceLength > Vector256<byte>.Count)
+                    {
+                        // Process the input in chunks of 32 bytes.
+                        // If the input length is a multiple of 32, don't consume the last 32 bytes in this loop.
+                        // Let the fallback below handle it instead. This is why the condition is
+                        // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+                        ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256<byte>.Count);
+
+                        do
+                        {
+                            Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
+
+                            Vector256<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap256_0, bitmap256_1);
+                            if (result != Vector256<byte>.Zero)
+                            {
+                                return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                            }
+
+                            currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256<byte>.Count);
+                        }
+                        while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
+                    }
 
-                    Vector128<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap0, bitmap1);
-                    if (result != Vector128<byte>.Zero)
+                    // We have 1-32 bytes remaining. Process the first and last half vectors in the search space.
+                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
+                    Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
                     {
-                        return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
+
+                        ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd)
+                            ? ref halfVectorAwayFromEnd
+                            : ref currentSearchSpace;
+
+                        Vector128<byte> source0 = Vector128.LoadUnsafe(ref firstVector);
+                        Vector128<byte> source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd);
+                        Vector256<byte> source = Vector256.Create(source0, source1);
+
+                        Vector256<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap256_0, bitmap256_1);
+                        if (result != Vector256<byte>.Zero)
+                        {
+                            return ComputeFirstIndexOverlapped<byte, TNegator>(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result);
+                        }
                     }
 
-                    currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128<byte>.Count);
+                    return -1;
+                }
+                else
+                {
+                    // Process the input in chunks of 16 bytes.
+                    // If the input length is a multiple of 16, don't consume the last 16 bytes in this loop.
+                    // Let the fallback below handle it instead. This is why the condition is
+                    // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+                    ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count);
+
+                    do
+                    {
+                        Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+
+                        Vector128<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap0, bitmap1);
+                        if (result != Vector128<byte>.Zero)
+                        {
+                            return ComputeFirstIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        }
+
+                        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128<byte>.Count);
+                    }
+                    while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
                 }
-                while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd));
             }
 
             // We have 1-16 bytes remaining. Process the first and last half vectors in the search space.
             // They may overlap, but we'll handle that in the index calculation if we do get a match.
             Debug.Assert(searchSpaceLength >= sizeof(ulong), "We expect that the input is long enough for us to load a ulong.");
             {
-                ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128<byte>.Count / 2);
+                ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - sizeof(ulong));
 
                 ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd)
                     ? ref halfVectorAwayFromEnd
@@ -438,35 +709,89 @@ internal static int LastIndexOfAnyVectorized<TNegator>(ref byte searchSpace, int
 
             if (searchSpaceLength > Vector128<byte>.Count)
             {
-                // Process the input in chunks of 16 bytes.
-                // If the input length is a multiple of 16, don't consume the last 16 characters in this loop.
-                // Let the fallback below handle it instead. This is why the condition is
-                // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
-                ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
-
-                do
+                if (Avx2.IsSupported)
                 {
-                    currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count);
+                    Vector256<byte> bitmap256_0 = Vector256.Create(bitmap0, bitmap0);
+                    Vector256<byte> bitmap256_1 = Vector256.Create(bitmap1, bitmap1);
 
-                    Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+                    if (searchSpaceLength > Vector256<byte>.Count)
+                    {
+                        // Process the input in chunks of 32 bytes.
+                        // If the input length is a multiple of 32, don't consume the last 32 bytes in this loop.
+                        // Let the fallback below handle it instead. This is why the condition is
+                        // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
+                        ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256<byte>.Count);
+
+                        do
+                        {
+                            currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256<byte>.Count);
+
+                            Vector256<byte> source = Vector256.LoadUnsafe(ref currentSearchSpace);
+
+                            Vector256<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap256_0, bitmap256_1);
+                            if (result != Vector256<byte>.Zero)
+                            {
+                                return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                            }
+                        }
+                        while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
+                    }
 
-                    Vector128<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap0, bitmap1);
-                    if (result != Vector128<byte>.Zero)
+                    // We have 1-32 bytes remaining. Process the first and last half vectors in the search space.
+                    // They may overlap, but we'll handle that in the index calculation if we do get a match.
+                    Debug.Assert(searchSpaceLength >= Vector128<byte>.Count, "We expect that the input is long enough for us to load a Vector128.");
                     {
-                        return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
+
+                        ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart)
+                            ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count)
+                            : ref searchSpace;
+
+                        Vector128<byte> source0 = Vector128.LoadUnsafe(ref searchSpace);
+                        Vector128<byte> source1 = Vector128.LoadUnsafe(ref secondVector);
+                        Vector256<byte> source = Vector256.Create(source0, source1);
+
+                        Vector256<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap256_0, bitmap256_1);
+                        if (result != Vector256<byte>.Zero)
+                        {
+                            return ComputeLastIndexOverlapped<byte, TNegator>(ref searchSpace, ref secondVector, result);
+                        }
                     }
+
+                    return -1;
+                }
+                else
+                {
+                    // Process the input in chunks of 16 bytes.
+                    // If the input length is a multiple of 16, don't consume the last 16 bytes in this loop.
+                    // Let the fallback below handle it instead. This is why the condition is
+                    // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
+                    ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count);
+
+                    do
+                    {
+                        currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128<byte>.Count);
+
+                        Vector128<byte> source = Vector128.LoadUnsafe(ref currentSearchSpace);
+
+                        Vector128<byte> result = IndexOfAnyLookup<TNegator>(source, bitmap0, bitmap1);
+                        if (result != Vector128<byte>.Zero)
+                        {
+                            return ComputeLastIndex<byte, TNegator>(ref searchSpace, ref currentSearchSpace, result);
+                        }
+                    }
+                    while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
                 }
-                while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart));
             }
 
             // We have 1-16 bytes remaining. Process the first and last half vectors in the search space.
             // They may overlap, but we'll handle that in the index calculation if we do get a match.
             Debug.Assert(searchSpaceLength >= sizeof(ulong), "We expect that the input is long enough for us to load a ulong.");
             {
-                ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128<byte>.Count / 2);
+                ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, sizeof(ulong));
 
                 ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart)
-                    ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128<short>.Count)
+                    ? ref Unsafe.Subtract(ref currentSearchSpace, sizeof(ulong))
                     : ref searchSpace;
 
                 ulong source0 = Unsafe.ReadUnaligned<ulong>(ref searchSpace);
@@ -558,6 +883,54 @@ private static Vector128<byte> IndexOfAnyLookupCore(Vector128<byte> source, Vect
             return result;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<byte> IndexOfAnyLookup<TNegator, TOptimizations>(Vector256<short> source0, Vector256<short> source1, Vector256<byte> bitmapLookup)
+            where TNegator : struct, INegator
+            where TOptimizations : struct, IOptimizations
+        {
+            // Vector256 overload for 16-bit input: narrows both vectors to bytes, then runs the bitmap lookup. See comments in IndexOfAnyLookup(Vector128<byte>) above for more details.
+            Vector256<byte> source = Avx2.PackUnsignedSaturate(source0, source1); // Saturating narrow; the resulting element order is described in FixUpPackedVector256Mask.
+            Vector256<byte> result = IndexOfAnyLookupCore(source, bitmapLookup); // Non-zero byte = that element's bit is set in the bitmap.
+
+            if (TOptimizations.NeedleContainsZero) // Extra filtering is only done when the optimization type reports a zero in the needle.
+            {
+                Vector256<short> ascii0 = Vector256.LessThan(source0.AsUInt16(), Vector256.Create((ushort)128)).AsInt16(); // All-ones where the original 16-bit value is < 128.
+                Vector256<short> ascii1 = Vector256.LessThan(source1.AsUInt16(), Vector256.Create((ushort)128)).AsInt16(); // Same check for the second input vector.
+                Vector256<byte> ascii = Avx2.PackSignedSaturate(ascii0, ascii1).AsByte(); // Narrow the compare masks to bytes with the same kind of pack used for 'source'.
+                result &= ascii; // Clear lookup hits for elements that were not < 128 before narrowing.
+            }
+
+            return TNegator.NegateIfNeeded(result); // TNegator decides whether the result is inverted (Negate turns zero bytes into matches).
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<byte> IndexOfAnyLookup<TNegator, TOptimizations>(Vector256<byte> source, Vector256<byte> bitmapLookup)
+            where TNegator : struct, INegator
+            where TOptimizations : struct, IOptimizations
+        {
+            // Vector256 overload for byte input: runs the bitmap lookup directly on 'source'. See comments in IndexOfAnyLookup(Vector128<byte>) above for more details.
+            Vector256<byte> result = IndexOfAnyLookupCore(source, bitmapLookup); // Non-zero byte = that element's bit is set in the bitmap.
+
+            if (TOptimizations.NeedleContainsZero) // Extra filtering is only done when the optimization type reports a zero in the needle.
+            {
+                Vector256<byte> ascii = Vector256.LessThan(source, Vector256.Create((byte)128)); // All-ones where the byte is < 128.
+                result &= ascii; // Clear lookup hits for bytes >= 128.
+            }
+
+            return TNegator.NegateIfNeeded(result); // TNegator decides whether the result is inverted (Negate turns zero bytes into matches).
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<byte> IndexOfAnyLookupCore(Vector256<byte> source, Vector256<byte> bitmapLookup)
+        {
+            // Returns a vector that is non-zero in every byte position whose source byte is flagged in bitmapLookup. See comments in IndexOfAnyLookupCore(Vector128<byte>) above for more details.
+            Vector256<byte> highNibbles = Vector256.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector256.Create((byte)0xF); // Shift is done on 32-bit elements; the 0xF mask drops bits that crossed in from the neighboring byte.
+            Vector256<byte> bitMask = Avx2.Shuffle(bitmapLookup, source); // Per-byte table lookup into bitmapLookup, indexed by the source bytes.
+            Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibbles); // Table bytes are 0x01, 0x02, ..., 0x80 repeated, so entry i is 1 << (i & 7).
+            Vector256<byte> result = bitMask & bitPositions; // Non-zero where the looked-up bitmap byte has the selected bit set.
+            return result;
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static Vector128<byte> IndexOfAnyLookup<TNegator>(Vector128<byte> source, Vector128<byte> bitmapLookup0, Vector128<byte> bitmapLookup1)
             where TNegator : struct, INegator
@@ -572,14 +945,36 @@ private static Vector128<byte> IndexOfAnyLookup<TNegator>(Vector128<byte> source
 
             Vector128<byte> bitmask = Shuffle(Vector128.Create(0x8040201008040201).AsByte(), highNibbles);
 
-            Vector128<byte> mask = Vector128.LessThan(highNibbles, Vector128.Create((byte)0x8));
-            Vector128<byte> bitsets = Vector128.ConditionalSelect(mask, row0, row1);
+            Vector128<byte> mask = Vector128.GreaterThan(highNibbles.AsSByte(), Vector128.Create((sbyte)0x7)).AsByte();
+            Vector128<byte> bitsets = Vector128.ConditionalSelect(mask, row1, row0);
 
             Vector128<byte> result = Vector128.Equals(bitsets & bitmask, bitmask);
 
             return TNegator.NegateIfNeeded(result);
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<byte> IndexOfAnyLookup<TNegator>(Vector256<byte> source, Vector256<byte> bitmapLookup0, Vector256<byte> bitmapLookup1)
+            where TNegator : struct, INegator
+        {
+            // Vector256 variant of the two-bitmap lookup: http://0x80.pl/articles/simd-byte-lookup.html#universal-algorithm
+
+            Vector256<byte> lowNibbles = source & Vector256.Create((byte)0xF); // Low 4 bits of each byte: index into both bitmap tables.
+            Vector256<byte> highNibbles = Vector256.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector256.Create((byte)0xF); // High 4 bits of each byte; the mask drops bits shifted in from the neighboring byte.
+
+            Vector256<byte> row0 = Avx2.Shuffle(bitmapLookup0, lowNibbles); // Candidate bitmap byte from the first table.
+            Vector256<byte> row1 = Avx2.Shuffle(bitmapLookup1, lowNibbles); // Candidate bitmap byte from the second table.
+
+            Vector256<byte> bitmask = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibbles); // Table bytes are 0x01..0x80 repeated, so this is 1 << (highNibble & 7).
+
+            Vector256<byte> mask = Vector256.GreaterThan(highNibbles.AsSByte(), Vector256.Create((sbyte)0x7)).AsByte(); // All-ones where highNibble >= 8.
+            Vector256<byte> bitsets = Vector256.ConditionalSelect(mask, row1, row0); // row1 where highNibble >= 8, otherwise row0.
+
+            Vector256<byte> result = Vector256.Equals(bitsets & bitmask, bitmask); // All-ones where the selected bit is set in the chosen bitmap byte.
+
+            return TNegator.NegateIfNeeded(result); // TNegator decides whether the result is inverted.
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static Vector128<byte> Shuffle(Vector128<byte> vector, Vector128<byte> indices)
         {
@@ -639,18 +1034,105 @@ private static unsafe int ComputeLastIndexOverlapped<T, TNegator>(ref T searchSp
             return offsetInVector - Vector128<short>.Count + (int)(Unsafe.ByteOffset(ref searchSpace, ref secondVector) / sizeof(T));
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe int ComputeFirstIndex<T, TNegator>(ref T searchSpace, ref T current, Vector256<byte> result)
+            where TNegator : struct, INegator
+        { // Converts a Vector256 lookup result into the index (in T elements, relative to searchSpace) of its first match.
+            uint mask = TNegator.ExtractMask(result); // One bit per byte of 'result'; set bits mark matches.
+            if (typeof(T) == typeof(short)) // 16-bit input went through a pack, so the mask bits are not in element order yet.
+            {
+                mask = FixUpPackedVector256Mask(mask);
+            }
+
+            int offsetInVector = BitOperations.TrailingZeroCount(mask); // Lowest set bit = first match within the vector.
+            return offsetInVector + (int)(Unsafe.ByteOffset(ref searchSpace, ref current) / sizeof(T)); // Add the element offset of 'current' from the start.
+        }
+
+#pragma warning disable IDE0060 // https://github.com/dotnet/roslyn-analyzers/issues/6228
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe int ComputeFirstIndexOverlapped<T, TNegator>(ref T searchSpace, ref T current0, ref T current1, Vector256<byte> result)
+            where TNegator : struct, INegator
+        { // Like ComputeFirstIndex, but the two 16-element halves of 'result' belong to separate positions: current0 (low half) and current1 (high half).
+            uint mask = TNegator.ExtractMask(result); // One bit per byte of 'result'; set bits mark matches.
+            if (typeof(T) == typeof(short)) // 16-bit input went through a pack, so the mask bits are not in element order yet.
+            {
+                mask = FixUpPackedVector256Mask(mask);
+            }
+
+            int offsetInVector = BitOperations.TrailingZeroCount(mask); // Lowest set bit = first match across both halves.
+            if (offsetInVector >= Vector256<short>.Count) // Vector256<short>.Count (16) serves as the size of one half of the 32-element result.
+            {
+                // We matched within the second vector
+                current0 = ref current1; // Use the second half's position as the base.
+                offsetInVector -= Vector256<short>.Count; // Make the offset relative to that half.
+            }
+            return offsetInVector + (int)(Unsafe.ByteOffset(ref searchSpace, ref current0) / sizeof(T)); // Add the element offset of the chosen half from the start.
+        }
+#pragma warning restore IDE0060 // https://github.com/dotnet/roslyn-analyzers/issues/6228
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe int ComputeLastIndex<T, TNegator>(ref T searchSpace, ref T current, Vector256<byte> result)
+            where TNegator : struct, INegator
+        { // Converts a Vector256 lookup result into the index (in T elements, relative to searchSpace) of its last match.
+            uint mask = TNegator.ExtractMask(result); // One bit per byte of 'result'; set bits mark matches.
+            if (typeof(T) == typeof(short)) // 16-bit input went through a pack, so the mask bits are not in element order yet.
+            {
+                mask = FixUpPackedVector256Mask(mask);
+            }
+
+            int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); // Position of the highest set bit = last match within the vector.
+            return offsetInVector + (int)(Unsafe.ByteOffset(ref searchSpace, ref current) / sizeof(T)); // Add the element offset of 'current' from the start.
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe int ComputeLastIndexOverlapped<T, TNegator>(ref T searchSpace, ref T secondVector, Vector256<byte> result)
+            where TNegator : struct, INegator
+        { // Like ComputeLastIndex, but the low half of 'result' is treated as starting at searchSpace and the high half as starting at secondVector.
+            uint mask = TNegator.ExtractMask(result); // One bit per byte of 'result'; set bits mark matches.
+            if (typeof(T) == typeof(short)) // 16-bit input went through a pack, so the mask bits are not in element order yet.
+            {
+                mask = FixUpPackedVector256Mask(mask);
+            }
+
+            int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); // Position of the highest set bit = last match across both halves.
+            if (offsetInVector < Vector256<short>.Count) // Vector256<short>.Count (16) serves as the size of one half of the 32-element result.
+            {
+                return offsetInVector; // Match is in the first half, so the offset is returned unchanged as the index.
+            }
+
+            // We matched within the second vector
+            return offsetInVector - Vector256<short>.Count + (int)(Unsafe.ByteOffset(ref searchSpace, ref secondVector) / sizeof(T)); // Offset within the second half plus that half's element offset from the start.
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint FixUpPackedVector256Mask(uint mask)
+        { // Reorders the 32 mask bits of a packed Vector256 result so that they follow the order of the original elements.
+            Debug.Assert(Avx2.IsSupported);
+            // Avx2.PackUnsignedSaturate(Vector256.Create((short)1), Vector256.Create((short)2)) will result in
+            // 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2
+            // We want to swap the X and Y bits
+            // 1, 1, 1, 1, 1, 1, 1, 1, X, X, X, X, X, X, X, X, Y, Y, Y, Y, Y, Y, Y, Y, 2, 2, 2, 2, 2, 2, 2, 2
+            const uint CorrectPositionsMask = 0xFF0000FF; // Bits 0-7 and 24-31 are already where they belong.
+
+            return (mask & CorrectPositionsMask) | BinaryPrimitives.ReverseEndianness(mask & ~CorrectPositionsMask); // Byte reversal swaps the two middle bytes; the outer bytes of its input are zero.
+        }
+
         internal interface INegator
         {
             static abstract bool NegateIfNeeded(bool result);
             static abstract Vector128<byte> NegateIfNeeded(Vector128<byte> result);
+            static abstract Vector256<byte> NegateIfNeeded(Vector256<byte> result); // Vector256 counterpart of the Vector128 overload.
             static abstract uint ExtractMask(Vector128<byte> result);
+            static abstract uint ExtractMask(Vector256<byte> result); // Vector256 counterpart: one mask bit per byte, 32 bits in total.
         }
 
         internal readonly struct DontNegate : INegator
         {
             public static bool NegateIfNeeded(bool result) => result;
             public static Vector128<byte> NegateIfNeeded(Vector128<byte> result) => result;
+            public static Vector256<byte> NegateIfNeeded(Vector256<byte> result) => result; // Identity: the lookup result is used as-is.
             public static uint ExtractMask(Vector128<byte> result) => ~Vector128.Equals(result, Vector128<byte>.Zero).ExtractMostSignificantBits();
+            public static uint ExtractMask(Vector256<byte> result) => ~Vector256.Equals(result, Vector256<byte>.Zero).ExtractMostSignificantBits(); // Bit set for every non-zero byte: compare with zero, take the top bits, invert.
         }
 
         internal readonly struct Negate : INegator
@@ -659,7 +1141,9 @@ internal interface INegator
             // This is intentionally testing for equality with 0 instead of "~result".
             // We want to know if any character didn't match, as that means it should be treated as a match for the -Except method.
             public static Vector128<byte> NegateIfNeeded(Vector128<byte> result) => Vector128.Equals(result, Vector128<byte>.Zero);
+            public static Vector256<byte> NegateIfNeeded(Vector256<byte> result) => Vector256.Equals(result, Vector256<byte>.Zero);
             public static uint ExtractMask(Vector128<byte> result) => result.ExtractMostSignificantBits();
+            public static uint ExtractMask(Vector256<byte> result) => result.ExtractMostSignificantBits();
         }
 
         internal interface IOptimizations