dotnet · danmoseley · Aug 17, 2022 · Mar 22, 2022 · Mar 23, 2022 · Apr 13, 2022
diff --git a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs
@@ -968,13 +968,11 @@ private string ReplaceCore(string oldValue, string? newValue, CompareInfo? ci, C
         //
         public string Replace(char oldChar, char newChar)
         {
-            if (oldChar == newChar)
-                return this;
-
-            int firstIndex = IndexOf(oldChar);
-
-            if (firstIndex < 0)
+            int firstIndex;
+            if (oldChar == newChar || (firstIndex = IndexOf(oldChar)) < 0)
+            {
                 return this;
+            }
 
             int remainingLength = Length - firstIndex;
             string result = FastAllocateString(Length);
@@ -988,35 +986,56 @@ public string Replace(char oldChar, char newChar)
             }
 
             // Copy the remaining characters, doing the replacement as we go.
-            ref ushort pSrc = ref Unsafe.Add(ref Unsafe.As<char, ushort>(ref _firstChar), copyLength);
-            ref ushort pDst = ref Unsafe.Add(ref Unsafe.As<char, ushort>(ref result._firstChar), copyLength);
+            ref ushort pSrc = ref Unsafe.Add(ref Unsafe.As<char, ushort>(ref _firstChar), (nint)(uint)copyLength);
+            ref ushort pDst = ref Unsafe.Add(ref Unsafe.As<char, ushort>(ref result._firstChar), (nint)(uint)copyLength);
+            nint i = 0;
 
             if (Vector.IsHardwareAccelerated && remainingLength >= Vector<ushort>.Count)
             {
-                Vector<ushort> oldChars = new Vector<ushort>(oldChar);
-                Vector<ushort> newChars = new Vector<ushort>(newChar);
+                Vector<ushort> oldChars = new(oldChar);
+                Vector<ushort> newChars = new(newChar);
 
-                do
+                Vector<ushort> original;
+                Vector<ushort> equals;
+                Vector<ushort> results;
+
+                nint lengthToExamine = (nint)(uint)(remainingLength - Vector<ushort>.Count);
+
+                if (lengthToExamine > 0)
                 {
-                    Vector<ushort> original = Unsafe.ReadUnaligned<Vector<ushort>>(ref Unsafe.As<ushort, byte>(ref pSrc));
-                    Vector<ushort> equals = Vector.Equals(original, oldChars);
-                    Vector<ushort> results = Vector.ConditionalSelect(equals, newChars, original);
-                    Unsafe.WriteUnaligned(ref Unsafe.As<ushort, byte>(ref pDst), results);
-
-                    pSrc = ref Unsafe.Add(ref pSrc, Vector<ushort>.Count);
-                    pDst = ref Unsafe.Add(ref pDst, Vector<ushort>.Count);
-                    remainingLength -= Vector<ushort>.Count;
+                    do
+                    {
+                        original = Unsafe.ReadUnaligned<Vector<ushort>>(ref Unsafe.As<ushort, byte>(ref Unsafe.Add(ref pSrc, i)));
+                        equals = Vector.Equals(original, oldChars);
+                        results = Vector.ConditionalSelect(equals, newChars, original);
+                        Unsafe.WriteUnaligned(ref Unsafe.As<ushort, byte>(ref Unsafe.Add(ref pDst, i)), results);
+
+                        i += Vector<ushort>.Count;
+                    }
+                    while (i < lengthToExamine);
                 }
-                while (remainingLength >= Vector<ushort>.Count);
-            }
 
-            for (; remainingLength > 0; remainingLength--)
-            {
-                ushort currentChar = pSrc;
-                pDst = currentChar == oldChar ? newChar : currentChar;
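+                // There are (0, Vector<ushort>.Count] elements remaining now: the loop above stops as soon as
+                // i >= remainingLength - Vector<ushort>.Count, and it is skipped entirely when exactly
+                // Vector<ushort>.Count elements were available.
+                // As we know that in total there are at least Vector<ushort>.Count elements available, we read
+                // a vector from the very end of the string, perform the replace and write to the destination
+                // at the very end.
+                // Elements that were already processed above are simply recomputed from the source string, which
+                // yields the same values again, so the overlap is harmless.
+                // Thus we can eliminate the scalar processing of the remaining elements.
+                // This final operation is always required (at least one element is still pending), so it is
+                // performed unconditionally without any additional check or branch.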
 
 // -------------------- For Vector<ushort>.Count == 8 (SSE2 / ARM NEON) -------------------- 
 [InlineData("Aaaaaaaa", 'A', 'a', "aaaaaaaa")] // Single iteration of vectorised path; no remainders through non-vectorised path 
 // Three leading 'a's before a match (copyLength > 0), Single iteration of vectorised path; no remainders through non-vectorised path 
 [InlineData("aaaAaaaaaaa", 'A', 'a', "aaaaaaaaaaa")] 
 // Single iteration of vectorised path; 3 remainders through non-vectorised path 
 [InlineData("AaaaaaaaaAa", 'A', 'a', "aaaaaaaaaaa")] 
 // ------------------------- For Vector<ushort>.Count == 16 (AVX2) ------------------------- 
 [InlineData("AaaaaaaaAaaaaaaa", 'A', 'a', "aaaaaaaaaaaaaaaa")] // Single iteration of vectorised path; no remainders through non-vectorised path 
 // Three leading 'a's before a match (copyLength > 0), Single iteration of vectorised path; no remainders through non-vectorised path 
 [InlineData("aaaAaaaaaaaAaaaaaaa", 'A', 'a', "aaaaaaaaaaaaaaaaaaa")] 
 // Single iteration of vectorised path; 3 remainders through non-vectorised path 
 [InlineData("AaaaaaaaAaaaaaaaaAa", 'A', 'a', "aaaaaaaaaaaaaaaaaaa")] 
 // ----------------------------------- General test data ----------------------------------- 
 // -------------------- For Vector<ushort>.Count == 8 (SSE2 / ARM NEON) -------------------- 
 [InlineData("Aaaaaaaa", 'A', 'a', "aaaaaaaa")] // Single iteration of vectorised path; no remainders through non-vectorised path 
 // Three leading 'a's before a match (copyLength > 0), Single iteration of vectorised path; no remainders through non-vectorised path 
 [InlineData("aaaAaaaaaaa", 'A', 'a', "aaaaaaaaaaa")] 
 // Single iteration of vectorised path; 3 remainders through non-vectorised path 
 [InlineData("AaaaaaaaaAa", 'A', 'a', "aaaaaaaaaaa")] 
 // ------------------------- For Vector<ushort>.Count == 16 (AVX2) ------------------------- 
 [InlineData("AaaaaaaaAaaaaaaa", 'A', 'a', "aaaaaaaaaaaaaaaa")] // Single iteration of vectorised path; no remainders through non-vectorised path 
 // Three leading 'a's before a match (copyLength > 0), Single iteration of vectorised path; no remainders through non-vectorised path 
 [InlineData("aaaAaaaaaaaAaaaaaaa", 'A', 'a', "aaaaaaaaaaaaaaaaaaa")] 
 // Single iteration of vectorised path; 3 remainders through non-vectorised path 
 [InlineData("AaaaaaaaAaaaaaaaaAa", 'A', 'a', "aaaaaaaaaaaaaaaaaaa")] 
 // ----------------------------------- General test data ----------------------------------- 
-                pSrc = ref Unsafe.Add(ref pSrc, 1);
-                pDst = ref Unsafe.Add(ref pDst, 1);
+                i = (nint)(uint)this.Length - Vector<ushort>.Count;
+                original = Unsafe.ReadUnaligned<Vector<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref _firstChar, i)));
+                equals = Vector.Equals(original, oldChars);
+                results = Vector.ConditionalSelect(equals, newChars, original);
+                Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref result._firstChar, i)), results);
+            }
+            else
+            {
+                for (; i < (nint)(uint)remainingLength; ++i)
+                {
+                    ushort currentChar = Unsafe.Add(ref pSrc, i);
+                    Unsafe.Add(ref pDst, i) = currentChar == oldChar ? newChar : currentChar;
+                }
             }
 
             return result;