From ca8f6ad78ed39e1dd3b601049cc7a90f01324271 Mon Sep 17 00:00:00 2001
From: Yat Long Poon <poon.yat.long@gmail.com>
Date: Mon, 3 Nov 2025 10:22:26 +0000
Subject: [PATCH 1/8] Integrate SimdUnicode UTF-8 Validation for AdvSimd

---
 THIRD-PARTY-NOTICES.TXT                       |  29 ++
 .../src/System/Text/Ascii.Utility.cs          |   2 +-
 .../Text/Unicode/Utf8Utility.Validation.cs    | 428 +++++++++++++++++-
 .../Unicode/Utf8UtilityTests.ValidateBytes.cs |  81 ++--
 4 files changed, 508 insertions(+), 32 deletions(-)
diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT
index 095be6312166ed..01ef9b39dc7198 100644
--- a/THIRD-PARTY-NOTICES.TXT
+++ b/THIRD-PARTY-NOTICES.TXT
@@ -720,6 +720,35 @@ License for fastmod (https://github.com/lemire/fastmod), ibm-fpgen (https://gith
    See the License for the specific language governing permissions and
    limitations under the License.
 
+License for SimdUnicode (https://github.com/simdutf/SimdUnicode)
+--------------------------------------
+
+   Copyright 2025 Daniel Lemire, Nick Nuon
+   Which is based on "Validating UTF-8 In Less Than One Instruction Per Byte" article available at https://arxiv.org/abs/2010.03090
+   (c) John Keiser, Daniel Lemire
+
+   MIT License
+
+   Copyright (c) 2023 SimdUnicode authors
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
 License for sse4-strstr (https://github.com/WojciechMula/sse4-strstr)
 --------------------------------------
 
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
index cdec5bb675b312..4982b1d6598ea8 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
@@ -1524,7 +1524,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii
 
 #if NET
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool VectorContainsNonAsciiChar(Vector128<byte> asciiVector)
+        internal static bool VectorContainsNonAsciiChar(Vector128<byte> asciiVector)
         {
             // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii
 
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index 821037a538b3c8..4dac716564b2f8 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -67,6 +67,13 @@ internal static unsafe partial class Utf8Utility
             byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
 #endif
 
+#if NET
+            if (inputLength >= Vector128<byte>.Count && (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
+            {
+                return GetPointerToFirstInvalidByte_Vector128(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+            }
+#endif
+
             while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
             {
                 // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
@@ -138,7 +145,6 @@ internal static unsafe partial class Utf8Utility
                                     trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask) >> 2;
                                     goto LoopTerminatedEarlyDueToNonAsciiData;
                                 }
-
                                 pInputBuffer += 4 * sizeof(uint); // consumed 4 DWORDs
                             } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop);
                         }
@@ -756,12 +762,430 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
             {
                 throw new PlatformNotSupportedException();
             }
-
             Vector128<byte> mostSignificantBitIsSet = (value.AsSByte() >> 7).AsByte();
             Vector128<byte> extractedBits = mostSignificantBitIsSet & bitMask128;
             extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
             return extractedBits.AsUInt64().ToScalar();
         }
+
+        // The following algorithm is based on the SimdUnicode library:
+        //   https://github.com/simdutf/SimdUnicode
+        //   by Daniel Lemire and Nick Nuon
+        // And the paper "Validating UTF-8 In Less Than One Instruction Per Byte":
+        //   https://arxiv.org/pdf/2010.03090
+        //   by John Keiser and Daniel Lemire, 2021
+
+        [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        private static byte* GetPointerToFirstInvalidByte_Vector128(byte* pInputBuffer, int inputLength,
+                out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        {
+            Debug.Assert(inputLength >= Vector128<byte>.Count);
+
+            Vector128<byte> prevInputBlock = Vector128<byte>.Zero;
+
+            Vector128<byte> maxValue = Vector128.Create(
+                    255, 255, 255, 255, 255, 255, 255, 255,
+                    255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
+            Vector128<byte> prevIncomplete = Vector128.SubtractSaturate(prevInputBlock, maxValue);
+            int numIncomplete; // maximum number of incomplete bytes to go back
+
+            const byte TOO_SHORT = 1 << 0;
+            const byte TOO_LONG = 1 << 1;
+            const byte OVERLONG_3 = 1 << 2;
+            const byte SURROGATE = 1 << 4;
+            const byte OVERLONG_2 = 1 << 5;
+            const byte TWO_CONTS = 1 << 7;
+            const byte TOO_LARGE = 1 << 3;
+            const byte TOO_LARGE_1000 = 1 << 6;
+            const byte OVERLONG_4 = 1 << 6;
+            const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
+
+            Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+                    TOO_SHORT | OVERLONG_2,
+                    TOO_SHORT,
+                    TOO_SHORT | OVERLONG_3 | SURROGATE,
+                    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+
+            Vector128<byte> shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+                    CARRY | OVERLONG_2,
+                    CARRY,
+                    CARRY,
+                    CARRY | TOO_LARGE,
+                    CARRY | TOO_LARGE | TOO_LARGE_1000,
+                    CARRY | TOO_LARGE | TOO_LARGE_1000,
+                    CARRY | TOO_LARGE | TOO_LARGE_1000,
+                    CARRY | TOO_LARGE | TOO_LARGE_1000,
+                    CARRY | TOO_LARGE | TOO_LARGE_1000,
+                    CARRY | TOO_LARGE | TOO_LARGE_1000,
+                    CARRY | TOO_LARGE | TOO_LARGE_1000,
+                    CARRY | TOO_LARGE | TOO_LARGE_1000,
+                    CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+                    CARRY | TOO_LARGE | TOO_LARGE_1000,
+                    CARRY | TOO_LARGE | TOO_LARGE_1000);
+
+            Vector128<byte> shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+                    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+                    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+
+            Vector128<byte> thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
+            Vector128<byte> fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
+            Vector128<byte> v0f = Vector128.Create((byte)0x0F);
+            Vector128<byte> v80 = Vector128.Create((byte)0x80);
+            Vector128<byte> fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1));
+            Vector128<sbyte> largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111
+
+            int contbytes = 0; // running count of continuation bytes (0x80..0xBF) across all processed blocks
+            int n4 = 0; // running count of lead bytes >= 0xF0 (start of a 4-byte sequence) across all processed blocks
+
+            // For Arm64:
+            //   Instead of updating n4 and contbytes continuously, we accumulate
+            //   the values in n4v and contv, while using overflowCounter to make
+            //   sure we do not overflow. This allows you to reach good performance
+            //   on systems where summing across vectors is slow.
+            Vector128<sbyte> n4v = Vector128<sbyte>.Zero;
+            Vector128<sbyte> contv = Vector128<sbyte>.Zero;
+            int overflowCounter = 0;
+
+            int processedLength = 0;
+            for (; processedLength <= inputLength - Vector128<byte>.Count; processedLength += Vector128<byte>.Count)
+            {
+
+                Vector128<byte> currentBlock = Vector128.Load(pInputBuffer + processedLength);
+                if (!Ascii.VectorContainsNonAsciiChar(currentBlock))
+                {
+                    // We have an ASCII block, no need to process it, but
+                    // we need to check if the previous block was incomplete.
+                    if (prevIncomplete != Vector128<byte>.Zero)
+                    {
+                        numIncomplete = Vector128<byte>.Count - 3;
+                        goto RewindPointerAndAdjustCounters;
+                    }
+
+                    // Often, we have a lot of ASCII characters in a row.
+                    int localasciirun = Vector128<byte>.Count;
+                    if (processedLength + localasciirun + Vector128<byte>.Count <= inputLength)
+                    {
+                        Vector128<byte> block = Vector128.Load(pInputBuffer + processedLength + localasciirun);
+                        if (!Ascii.VectorContainsNonAsciiChar(block))
+                        {
+                            localasciirun += Vector128<byte>.Count;
+                            for (; localasciirun <= inputLength - processedLength - (Vector128<byte>.Count * 4); localasciirun += (Vector128<byte>.Count * 4))
+                            {
+                                Vector128<byte> block1 = Vector128.Load(pInputBuffer + processedLength + localasciirun);
+                                Vector128<byte> block2 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128<byte>.Count * 1));
+                                Vector128<byte> block3 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128<byte>.Count * 2));
+                                Vector128<byte> block4 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128<byte>.Count * 3));
+                                if (Ascii.VectorContainsNonAsciiChar(block1 | block2 | block3 | block4))
+                                {
+                                    break;
+                                }
+                            }
+                        }
+                        processedLength += localasciirun - Vector128<byte>.Count;
+                    }
+                }
+                else
+                {
+                    // Contains non-ASCII characters, we need to do non-trivial processing
+
+                    Vector128<byte> prev1;
+                    Vector128<byte> prev2;
+                    Vector128<byte> prev3;
+                    Vector128<byte> byte1High;
+                    Vector128<byte> byte1Low;
+                    Vector128<byte> byte2High;
+
+                    // TODO: Support for other architectures can be added here.
+
+                    if (AdvSimd.Arm64.IsSupported)
+                    {
+                        prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 1));
+                        byte1High = AdvSimd.Arm64.VectorTableLookup(shuf1, prev1 >>> 4);
+                        byte1Low = AdvSimd.Arm64.VectorTableLookup(shuf2, (prev1 & v0f));
+                        byte2High = AdvSimd.Arm64.VectorTableLookup(shuf3, currentBlock >>> 4);
+                        prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2));
+                        prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3));
+                    }
+                    else
+                    {
+                        throw new PlatformNotSupportedException();
+                    }
+
+                    prevInputBlock = currentBlock;
+                    Vector128<byte> twoBytesError = byte1High & byte1Low & byte2High;
+                    Vector128<byte> isThirdByte = Vector128.SubtractSaturate(prev2, thirdByte);
+                    Vector128<byte> isFourthByte = Vector128.SubtractSaturate(prev3, fourthByte);
+                    Vector128<byte> must23 = isThirdByte | isFourthByte;
+                    Vector128<byte> must23As80 = must23 & v80;
+                    Vector128<byte> error = must23As80 ^ twoBytesError;
+                    if (error != Vector128<byte>.Zero)
+                    {
+                        numIncomplete = (processedLength == 0) ? 0 : 3;
+                        goto RewindPointerAndAdjustCounters;
+                    }
+                    prevIncomplete = Vector128.SubtractSaturate(currentBlock, maxValue);
+
+                    // For Arm64, use contv and n4v to accumulate the sum for better performance.
+                    // Otherwise, increment the adjustments directly on every iteration.
+
+                    if (AdvSimd.Arm64.IsSupported)
+                    {
+                        contv += Vector128.LessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont);
+                        n4v += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte();
+                        overflowCounter++;
+                        // We have a risk of overflow if overflowCounter reaches 255,
+                        // in which case, we empty contv and n4v, and update contbytes and
+                        // n4.
+                        if (overflowCounter == 0xff)
+                        {
+                            overflowCounter = 0;
+                            contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+                            contv = Vector128<sbyte>.Zero;
+                            if (n4v != Vector128<sbyte>.Zero)
+                            {
+                                n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                                n4v = Vector128<sbyte>.Zero;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        contbytes += BitOperations.PopCount(byte2High.ExtractMostSignificantBits());
+                        n4 += BitOperations.PopCount(Vector128.SubtractSaturate(currentBlock, fourthByte).ExtractMostSignificantBits());
+                    }
+                }
+            }
+
+            bool hasIncomplete = prevIncomplete != Vector128<byte>.Zero;
+            if (processedLength == inputLength && !hasIncomplete)
+            {
+                // No invalid byte is found across the whole input length.
+
+                if (AdvSimd.Arm64.IsSupported)
+                {
+                    contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+                    if (n4v != Vector128<sbyte>.Zero)
+                    {
+                        n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                    }
+                }
+                else
+                {
+                    // Do nothing since contbytes and n4 were incremented directly.
+                }
+
+                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                return pInputBuffer + inputLength;
+            }
+
+            numIncomplete = hasIncomplete ? 3 : 0;
+
+        RewindPointerAndAdjustCounters:
+
+            if (AdvSimd.Arm64.IsSupported)
+            {
+                contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+                if (n4v != Vector128<sbyte>.Zero)
+                {
+                    n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                }
+            }
+            else
+            {
+                // Do nothing since contbytes and n4 were incremented directly.
+            }
+
+            // Find the first invalid byte, going back if necessary.
+            // Then, adjust the counters 'n4' and 'contbytes', since we might be
+            // overcounting or undercounting them during processing.
+
+            byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(numIncomplete, pInputBuffer + processedLength - numIncomplete, inputLength - processedLength + numIncomplete);
+
+            AdjustCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+
+            (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+            return invalidBytePointer;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len)
+        {
+            // We scan the input from buf to buf + len, possibly going back howFarBack bytes, to find the end of
+            // a valid UTF-8 sequence. We return buf + len if the buffer is valid, otherwise we return the
+            // pointer to the first invalid byte.
+
+            int extraLen = 0;
+            bool foundLeadingBytes = false;
+
+            for (int i = 0; i <= howFarBack; i++)
+            {
+                byte candidateByte = buf[0 - i];
+                foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
+
+                if (foundLeadingBytes)
+                {
+                    buf -= i;
+                    extraLen = i;
+                    break;
+                }
+            }
+
+            if (!foundLeadingBytes)
+            {
+                return buf - howFarBack;
+            }
+            int pos = 0;
+            int nextPos;
+            uint codePoint = 0;
+
+            len += extraLen;
+
+            while (pos < len)
+            {
+
+                byte firstByte = buf[pos];
+
+                while (firstByte < 0b10000000)
+                {
+                    if (++pos == len)
+                    {
+                        return buf + len;
+                    }
+                    firstByte = buf[pos];
+                }
+
+                if ((firstByte & 0b11100000) == 0b11000000)
+                {
+                    nextPos = pos + 2;
+                    if (nextPos > len)
+                    {
+                        return buf + pos;
+                    } // Too short
+                    if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+                    {
+                        return buf + pos;
+                    } // Too short
+                    // range check
+                    codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(buf[pos + 1] & 0b00111111);
+                    if ((codePoint < 0x80) || (0x7ff < codePoint))
+                    {
+                        return buf + pos;
+                    } // Overlong
+                }
+                else if ((firstByte & 0b11110000) == 0b11100000)
+                {
+                    nextPos = pos + 3;
+                    if (nextPos > len)
+                    {
+                        return buf + pos;
+                    } // Too short
+                    // range check
+                    codePoint = (uint)(firstByte & 0b00001111) << 12 |
+                                 (uint)(buf[pos + 1] & 0b00111111) << 6 |
+                                 (uint)(buf[pos + 2] & 0b00111111);
+                    // Overlong, too large, or a UTF-16 surrogate code point:
+                    if ((codePoint < 0x800) || (0xffff < codePoint) ||
+                        (0xd7ff < codePoint && codePoint < 0xe000))
+                    {
+                        return buf + pos;
+                    }
+                    if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+                    {
+                        return buf + pos;
+                    } // Too short
+                    if ((buf[pos + 2] & 0b11000000) != 0b10000000)
+                    {
+                        return buf + pos;
+                    } // Too short
+                }
+                else if ((firstByte & 0b11111000) == 0b11110000)
+                {
+                    nextPos = pos + 4;
+                    if (nextPos > len)
+                    {
+                        return buf + pos;
+                    }
+                    if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+                    {
+                        return buf + pos;
+                    }
+                    if ((buf[pos + 2] & 0b11000000) != 0b10000000)
+                    {
+                        return buf + pos;
+                    }
+                    if ((buf[pos + 3] & 0b11000000) != 0b10000000)
+                    {
+                        return buf + pos;
+                    }
+                    // range check
+                    codePoint =
+                        (uint)(firstByte & 0b00000111) << 18 | (uint)(buf[pos + 1] & 0b00111111) << 12 |
+                        (uint)(buf[pos + 2] & 0b00111111) << 6 | (uint)(buf[pos + 3] & 0b00111111);
+                    if (codePoint <= 0xffff || 0x10ffff < codePoint)
+                    {
+                        return buf + pos;
+                    }
+                }
+                else
+                {
+                    // we may have a continuation/too long error
+                    return buf + pos;
+                }
+                pos = nextPos;
+            }
+
+            return buf + len; // no error
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int n4, ref int contbytes)
+        {
+            if (pInvalid < pProcessed)
+            {
+                for (byte* p = pInvalid; p < pProcessed; p++)
+                {
+                    if ((*p & 0b11000000) == 0b10000000)
+                    {
+                        contbytes -= 1;
+                    }
+                    if ((*p & 0b11110000) == 0b11110000)
+                    {
+                        n4 -= 1;
+                    }
+                }
+            }
+            else
+            {
+                for (byte* p = pProcessed; p < pInvalid; p++)
+                {
+                    if ((*p & 0b11000000) == 0b10000000)
+                    {
+                        contbytes += 1;
+                    }
+                    if ((*p & 0b11110000) == 0b11110000)
+                    {
+                        n4 += 1;
+                    }
+                }
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int n4, int contbytes)
+        {
+            int n3 = -2 * n4 + 2 * contbytes;
+            int n2 = n4 - 3 * contbytes;
+            int utfadjust = -2 * n4 - 2 * n3 - n2;
+            int scalaradjust = -n4;
+
+            return (utfadjust, scalaradjust);
+        }
 #endif
     }
 }
diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs
index 4730337b0878ed..0cfbb8afb6b847 100644
--- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs
+++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System.Buffers;
+using System.Collections;
 using System.Linq;
 using System.Reflection;
 using System.Runtime.InteropServices;
@@ -241,6 +242,13 @@ public void GetIndexOfFirstInvalidUtf8Sequence_WithOutOfRangeFourByteSequences_R
             {
                 AssertIsInvalidFourByteSequence(new byte[] { 0xF4, (byte)i, 0x80, 0x80 });
             }
+
+            // [ F5..FF ] [ 80..BF ] [ 80..BF ] [ 80..BF ] is out-of-range 4-byte sequence
+
+            for (int i = 0xF5; i < 0x100; i++)
+            {
+                AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x80, 0x80, 0x80 });
+            }
         }
 
         [Fact]
@@ -279,57 +287,72 @@ private static void AssertIsInvalidTwoByteSequence(byte[] invalidSequence)
         {
             Assert.Equal(2, invalidSequence.Length);
 
-            byte[] knownGoodBytes = Utf8Tests.DecodeHex(E_ACUTE);
-
-            byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD
-            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0);
+            // Exercise the vectorized codepath and insert the invalid sequence at different positions.
 
-            toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of first DWORD
-            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 2, 1, 0);
+            byte[] byteVector = Utf8Tests.DecodeHex(E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE);
 
-            // Run the same tests but with extra data at the beginning so that we're inside one of
-            // the 2-byte processing "hot loop" code paths.
+            for (int pos = 0; pos <= 16; pos++)
+            {
+                ArrayList testList = new ArrayList(byteVector);
 
-            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of next DWORD
-            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 2, 0);
+                if (pos % 2 != 0)
+                {
+                    // Overwrite the 2-byte character straddling 'pos' with ASCII so the insertion does not split a valid sequence.
+                    testList.SetRange(pos - pos % 2, new byte[2] {0x20, 0x21});
+                }
 
-            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of next DWORD
-            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 3, 0);
+                testList.InsertRange(pos, invalidSequence);
+                byte[] toTest = (byte[])testList.ToArray(typeof(byte));
+                GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos + 1) / 2, 0);
+            }
         }
 
         private static void AssertIsInvalidThreeByteSequence(byte[] invalidSequence)
         {
             Assert.Equal(3, invalidSequence.Length);
 
-            byte[] knownGoodBytes = Utf8Tests.DecodeHex(EURO_SYMBOL);
-
-            byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD
-            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0);
+            // Exercise the vectorized codepath and insert the invalid sequence at different positions.
 
-            // Run the same tests but with extra data at the beginning so that we're inside one of
-            // the 3-byte processing "hot loop" code paths.
+            byte[] byteVector = Utf8Tests.DecodeHex(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL);
 
-            toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling first and second DWORDs
-            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 3, 1, 0);
+            for (int pos = 0; pos <= 16; pos++)
+            {
+                ArrayList testList = new ArrayList(byteVector);
 
-            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling second and third DWORDs
-            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 2, 0);
+                if (pos % 3 != 0)
+                {
+                    // Overwrite the 3-byte character straddling 'pos' with ASCII so the insertion does not split a valid sequence.
+                    testList.SetRange(pos - pos % 3, new byte[3] {0x20, 0x21, 0x22});
+                }
 
-            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of third DWORD
-            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 9, 3, 0);
+                testList.InsertRange(pos, invalidSequence);
+                byte[] toTest = (byte[])testList.ToArray(typeof(byte));
+                GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos / 3) + (pos % 3), 0);
+            }
         }
 
         private static void AssertIsInvalidFourByteSequence(byte[] invalidSequence)
         {
             Assert.Equal(4, invalidSequence.Length);
 
-            byte[] knownGoodBytes = Utf8Tests.DecodeHex(GRINNING_FACE);
+            // Exercise the vectorized codepath and insert the invalid sequence at different positions.
+
+            byte[] byteVector = Utf8Tests.DecodeHex(GRINNING_FACE + GRINNING_FACE + GRINNING_FACE + GRINNING_FACE);
 
-            byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray();
-            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0);
+            for (int pos = 0; pos <= 16; pos++)
+            {
+                ArrayList testList = new ArrayList(byteVector);
 
-            toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray();
-            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 1, 1);
+                if (pos % 4 != 0)
+                {
+                    // Overwrite the 4-byte character straddling 'pos' with ASCII so the insertion does not split a valid sequence.
+                    testList.SetRange(pos - pos % 4, new byte[4] {0x20, 0x21, 0x22, 0x23});
+                }
+
+                testList.InsertRange(pos, invalidSequence);
+                byte[] toTest = (byte[])testList.ToArray(typeof(byte));
+                GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos / 4) + (pos % 4), pos / 4);
+            }
         }
 
         private static void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(string inputHex, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)

From a711ed03cb44e89e998237fa4f6ce860346be3aa Mon Sep 17 00:00:00 2001
From: Yat Long Poon <poon.yat.long@gmail.com>
Date: Thu, 18 Dec 2025 10:12:08 +0000
Subject: [PATCH 2/8] Fix overflow counter

contv and n4v are Vector128<sbyte> so the largest positive value is 127.
---
 .../src/System/Text/Unicode/Utf8Utility.Validation.cs         | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index 4dac716564b2f8..2f5d21efffad31 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -939,10 +939,10 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                         contv += Vector128.LessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont);
                         n4v += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte();
                         overflowCounter++;
-                        // We have a risk of overflow if overflowCounter reaches 255,
+                        // We have a risk of overflow if overflowCounter reaches 127,
                         // in which case, we empty contv and n4v, and update contbytes and
                         // n4.
-                        if (overflowCounter == 0xff)
+                        if (overflowCounter == 0x7f)
                         {
                             overflowCounter = 0;
                             contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();

From b594b1579c88417494e12a30a45ca1bb05251963 Mon Sep 17 00:00:00 2001
From: Yat Long Poon <poon.yat.long@gmail.com>
Date: Wed, 18 Mar 2026 13:31:46 +0000
Subject: [PATCH 3/8] Add comments and fix variable names

---
 .../Text/Unicode/Utf8Utility.Validation.cs    | 152 ++++++++++--------
 1 file changed, 88 insertions(+), 64 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index 2f5d21efffad31..ff36204d87ca08 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -145,6 +145,7 @@ internal static unsafe partial class Utf8Utility
                                     trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask) >> 2;
                                     goto LoopTerminatedEarlyDueToNonAsciiData;
                                 }
+
                                 pInputBuffer += 4 * sizeof(uint); // consumed 4 DWORDs
                             } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop);
                         }
@@ -762,6 +763,7 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
             {
                 throw new PlatformNotSupportedException();
             }
+
             Vector128<byte> mostSignificantBitIsSet = (value.AsSByte() >> 7).AsByte();
             Vector128<byte> extractedBits = mostSignificantBitIsSet & bitMask128;
             extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
@@ -783,24 +785,37 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
 
             Vector128<byte> prevInputBlock = Vector128<byte>.Zero;
 
+            // This is used to detect whether the previous block contains incomplete sequences.
+            // It contains the maximum values the previous bytes can be without generating a carry.
+            // If we see larger values, it means we need to go back and validate.
+            // The first 13 bytes can never generate a carry for a valid UTF-8 byte sequence, the
+            // last 3 bytes are the maximum starting byte of a 3-byte, 2-byte and 1-byte sequence
+            // respectively.
             Vector128<byte> maxValue = Vector128.Create(
                     255, 255, 255, 255, 255, 255, 255, 255,
                     255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
             Vector128<byte> prevIncomplete = Vector128.SubtractSaturate(prevInputBlock, maxValue);
             int numIncomplete; // maximum number of incomplete bytes to go back
 
-            const byte TOO_SHORT = 1 << 0;
-            const byte TOO_LONG = 1 << 1;
-            const byte OVERLONG_3 = 1 << 2;
-            const byte SURROGATE = 1 << 4;
-            const byte OVERLONG_2 = 1 << 5;
-            const byte TWO_CONTS = 1 << 7;
-            const byte TOO_LARGE = 1 << 3;
-            const byte TOO_LARGE_1000 = 1 << 6;
-            const byte OVERLONG_4 = 1 << 6;
-            const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
-
-            Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+            // The error bit encoding is slightly different from the paper, instead it follows the
+            // SimdUnicode implementation. TOO_LARGE_1000 and OVERLONG_4 can share the same bit as
+            // their conditions are mutually exclusive.
+            const byte TOO_SHORT = 1 << 0;      // Sequence is missing continuation bytes
+            const byte TOO_LONG = 1 << 1;       // ASCII byte is followed by a continuation byte
+            const byte OVERLONG_3 = 1 << 2;     // Character is out-of-range for a 3-byte sequence
+            const byte SURROGATE = 1 << 4;      // Character range is reserved for UTF-16 surrogates
+            const byte OVERLONG_2 = 1 << 5;     // Character is out-of-range for a 2-byte sequence
+            const byte TWO_CONTS = 1 << 7;      // (Not an error) Two continuation bytes
+            const byte TOO_LARGE = 1 << 3;      // Character is larger than the largest Unicode character
+            const byte TOO_LARGE_1000 = 1 << 6; // Same as TOO_LARGE, but the 2nd byte's high nibble is 0b1000
+            const byte OVERLONG_4 = 1 << 6;     // Character is out-of-range for a 4-byte sequence
+            const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // A common case for continuation bytes
+
+            // The 3 lookup tables used to map nibbles of consecutive bytes to possible errors in each case.
+            // A 4-bit nibble from the upper or lower half of a byte is used as an index (0-15) to look up the
+            // corresponding error mask from the 128-bit vector.
+
+            Vector128<byte> tableByte1High = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                     TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                     TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
                     TOO_SHORT | OVERLONG_2,
@@ -808,7 +823,7 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                     TOO_SHORT | OVERLONG_3 | SURROGATE,
                     TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
 
-            Vector128<byte> shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+            Vector128<byte> tableByte1Low = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
                     CARRY | OVERLONG_2,
                     CARRY,
                     CARRY,
@@ -825,7 +840,7 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                     CARRY | TOO_LARGE | TOO_LARGE_1000,
                     CARRY | TOO_LARGE | TOO_LARGE_1000);
 
-            Vector128<byte> shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+            Vector128<byte> tableByte2High = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                     TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                     TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
                     TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
@@ -835,21 +850,21 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
 
             Vector128<byte> thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
             Vector128<byte> fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
-            Vector128<byte> v0f = Vector128.Create((byte)0x0F);
-            Vector128<byte> v80 = Vector128.Create((byte)0x80);
+            Vector128<byte> vec0F = Vector128.Create((byte)0x0F);
+            Vector128<byte> vec80 = Vector128.Create((byte)0x80);
             Vector128<byte> fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1));
-            Vector128<sbyte> largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111
+            Vector128<sbyte> largestContinuationByte = Vector128.Create((sbyte)-65); // -65 => 0b10111111
 
-            int contbytes = 0; // number of continuation bytes in the block
-            int n4 = 0; // number of 4-byte sequences that start in this block
+            int numContinuationBytes = 0; // number of continuation bytes in the block
+            int numFourByteSequences = 0; // number of 4-byte sequences that start in this block
 
             // For Arm64:
-            //   Instead of updating n4 and contbytes continuously, we accumulate
-            //   the values in n4v and contv, while using overflowCounter to make
+            //   Instead of updating numFourByteSequences and numContinuationBytes continuously, we accumulate
+            //   the values in vecFourByteSequences and vecContinuationBytes, while using overflowCounter to make
             //   sure we do not overflow. This allows you to reach good performance
             //   on systems where summing across vectors is slow.
-            Vector128<sbyte> n4v = Vector128<sbyte>.Zero;
-            Vector128<sbyte> contv = Vector128<sbyte>.Zero;
+            Vector128<sbyte> vecFourByteSequences = Vector128<sbyte>.Zero;
+            Vector128<sbyte> vecContinuationBytes = Vector128<sbyte>.Zero;
             int overflowCounter = 0;
 
             int processedLength = 0;
@@ -906,9 +921,9 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                     if (AdvSimd.Arm64.IsSupported)
                     {
                         prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 1));
-                        byte1High = AdvSimd.Arm64.VectorTableLookup(shuf1, prev1 >>> 4);
-                        byte1Low = AdvSimd.Arm64.VectorTableLookup(shuf2, (prev1 & v0f));
-                        byte2High = AdvSimd.Arm64.VectorTableLookup(shuf3, currentBlock >>> 4);
+                        byte1High = AdvSimd.Arm64.VectorTableLookup(tableByte1High, prev1 >>> 4);
+                        byte1Low = AdvSimd.Arm64.VectorTableLookup(tableByte1Low, (prev1 & vec0F));
+                        byte2High = AdvSimd.Arm64.VectorTableLookup(tableByte2High, currentBlock >>> 4);
                         prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2));
                         prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3));
                     }
@@ -918,46 +933,56 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                     }
 
                     prevInputBlock = currentBlock;
+
+                    // Find invalid 2-byte sequences by matching the error bits from the table lookups.
                     Vector128<byte> twoBytesError = byte1High & byte1Low & byte2High;
+
+                    // Check if the sequences with two continuation bytes are valid.
+                    // This is only possible for 3- or 4-byte sequences, then we match the expected occurrences
+                    // against the MSB from the table lookup results.
                     Vector128<byte> isThirdByte = Vector128.SubtractSaturate(prev2, thirdByte);
                     Vector128<byte> isFourthByte = Vector128.SubtractSaturate(prev3, fourthByte);
-                    Vector128<byte> must23 = isThirdByte | isFourthByte;
-                    Vector128<byte> must23As80 = must23 & v80;
-                    Vector128<byte> error = must23As80 ^ twoBytesError;
+                    Vector128<byte> twoContinuationBytes = (isThirdByte | isFourthByte) & vec80; // Extract the MSB
+                    Vector128<byte> error = twoContinuationBytes ^ twoBytesError;
+
                     if (error != Vector128<byte>.Zero)
                     {
+                        // Error is found if the error bit mask is non-zero.
                         numIncomplete = (processedLength == 0) ? 0 : 3;
                         goto RewindPointerAndAdjustCounters;
                     }
+
                     prevIncomplete = Vector128.SubtractSaturate(currentBlock, maxValue);
 
-                    // For Arm64, use contv and n4v to accumulate the sum for better performance.
+                    // For Arm64, use vecContinuationBytes and vecFourByteSequences to accumulate the sum for better performance.
                     // Otherwise, increment the adjustments directly on every iteration.
 
                     if (AdvSimd.Arm64.IsSupported)
                     {
-                        contv += Vector128.LessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont);
-                        n4v += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte();
+                        vecContinuationBytes += Vector128.LessThanOrEqual(Vector128.AsSByte(currentBlock), largestContinuationByte);
+                        vecFourByteSequences += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte();
                         overflowCounter++;
                         // We have a risk of overflow if overflowCounter reaches 127,
-                        // in which case, we empty contv and n4v, and update contbytes and
-                        // n4.
+                        // in which case, we empty vecContinuationBytes and vecFourByteSequences, and update numContinuationBytes and
+                        // numFourByteSequences.
                         if (overflowCounter == 0x7f)
                         {
                             overflowCounter = 0;
-                            contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
-                            contv = Vector128<sbyte>.Zero;
-                            if (n4v != Vector128<sbyte>.Zero)
+
+                            // The vector results are negative, so subtract to make the scalar positive.
+                            numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar();
+                            vecContinuationBytes = Vector128<sbyte>.Zero;
+                            if (vecFourByteSequences != Vector128<sbyte>.Zero)
                             {
-                                n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
-                                n4v = Vector128<sbyte>.Zero;
+                                numFourByteSequences -= AdvSimd.Arm64.AddAcrossWidening(vecFourByteSequences).ToScalar();
+                                vecFourByteSequences = Vector128<sbyte>.Zero;
                             }
                         }
                     }
                     else
                     {
-                        contbytes += BitOperations.PopCount(byte2High.ExtractMostSignificantBits());
-                        n4 += BitOperations.PopCount(Vector128.SubtractSaturate(currentBlock, fourthByte).ExtractMostSignificantBits());
+                        numContinuationBytes += BitOperations.PopCount(byte2High.ExtractMostSignificantBits());
+                        numFourByteSequences += BitOperations.PopCount(Vector128.SubtractSaturate(currentBlock, fourthByte).ExtractMostSignificantBits());
                     }
                 }
             }
@@ -969,18 +994,18 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
 
                 if (AdvSimd.Arm64.IsSupported)
                 {
-                    contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
-                    if (n4v != Vector128<sbyte>.Zero)
+                    numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar();
+                    if (vecFourByteSequences != Vector128<sbyte>.Zero)
                     {
-                        n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                        numFourByteSequences -= AdvSimd.Arm64.AddAcrossWidening(vecFourByteSequences).ToScalar();
                     }
                 }
                 else
                 {
-                    // Do nothing since contbytes and n4 were incremented directly.
+                    // Do nothing since numContinuationBytes and numFourByteSequences were incremented directly.
                 }
 
-                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes);
                 return pInputBuffer + inputLength;
             }
 
@@ -990,26 +1015,26 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
 
             if (AdvSimd.Arm64.IsSupported)
             {
-                contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
-                if (n4v != Vector128<sbyte>.Zero)
+                numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar();
+                if (vecFourByteSequences != Vector128<sbyte>.Zero)
                 {
-                    n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                    numFourByteSequences -= AdvSimd.Arm64.AddAcrossWidening(vecFourByteSequences).ToScalar();
                 }
             }
             else
             {
-                // Do nothing since contbytes and n4 were incremented directly.
+                // Do nothing since numContinuationBytes and numFourByteSequences were incremented directly.
             }
 
             // Find the first invalid byte, going back if necessary.
-            // Then, adjust the counters 'n4' and 'contbytes', since we might be
+            // Then, adjust the counters 'numFourByteSequences' and 'numContinuationBytes', since we might be
             // overcounting or undercounting them during processing.
 
             byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(numIncomplete, pInputBuffer + processedLength - numIncomplete, inputLength - processedLength + numIncomplete);
 
-            AdjustCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+            AdjustCounters(pInputBuffer + processedLength, invalidBytePointer, ref numFourByteSequences, ref numContinuationBytes);
 
-            (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+            (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes);
             return invalidBytePointer;
         }
 
@@ -1048,7 +1073,6 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
 
             while (pos < len)
             {
-
                 byte firstByte = buf[pos];
 
                 while (firstByte < 0b10000000)
@@ -1144,7 +1168,7 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int n4, ref int contbytes)
+        private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int numFourByteSequences, ref int numContinuationBytes)
         {
             if (pInvalid < pProcessed)
             {
@@ -1152,11 +1176,11 @@ private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref
                 {
                     if ((*p & 0b11000000) == 0b10000000)
                     {
-                        contbytes -= 1;
+                        numContinuationBytes -= 1;
                     }
                     if ((*p & 0b11110000) == 0b11110000)
                     {
-                        n4 -= 1;
+                        numFourByteSequences -= 1;
                     }
                 }
             }
@@ -1166,23 +1190,23 @@ private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref
                 {
                     if ((*p & 0b11000000) == 0b10000000)
                     {
-                        contbytes += 1;
+                        numContinuationBytes += 1;
                     }
                     if ((*p & 0b11110000) == 0b11110000)
                     {
-                        n4 += 1;
+                        numFourByteSequences += 1;
                     }
                 }
             }
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int n4, int contbytes)
+        private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int numFourByteSequences, int numContinuationBytes)
         {
-            int n3 = -2 * n4 + 2 * contbytes;
-            int n2 = n4 - 3 * contbytes;
-            int utfadjust = -2 * n4 - 2 * n3 - n2;
-            int scalaradjust = -n4;
+            int n3 = -2 * numFourByteSequences + 2 * numContinuationBytes;
+            int n2 = numFourByteSequences - 3 * numContinuationBytes;
+            int utfadjust = -2 * numFourByteSequences - 2 * n3 - n2;
+            int scalaradjust = -numFourByteSequences;
 
             return (utfadjust, scalaradjust);
         }

From 924598d52f06961326657034e59a55b091d8e3fa Mon Sep 17 00:00:00 2001
From: Yat Long Poon <poon.yat.long@gmail.com>
Date: Wed, 18 Mar 2026 15:54:56 +0000
Subject: [PATCH 4/8] Address code review suggestions

---
 .../Text/Unicode/Utf8Utility.Validation.cs    | 87 +++++++++----------
 1 file changed, 43 insertions(+), 44 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index ff36204d87ca08..4427324c5f3f7e 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -795,7 +795,7 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                     255, 255, 255, 255, 255, 255, 255, 255,
                     255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
             Vector128<byte> prevIncomplete = Vector128.SubtractSaturate(prevInputBlock, maxValue);
-            int numIncomplete; // maximum number of incomplete bytes to go back
+            int numIncomplete = 0; // maximum number of incomplete bytes to go back
 
             // The error bit encoding is slightly different from the paper, instead it follows the
             // SimdUnicode implementation. TOO_LARGE_1000 and OVERLONG_4 can share the same bit as
@@ -867,10 +867,11 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
             Vector128<sbyte> vecContinuationBytes = Vector128<sbyte>.Zero;
             int overflowCounter = 0;
 
+            bool foundError = false;
+
             int processedLength = 0;
             for (; processedLength <= inputLength - Vector128<byte>.Count; processedLength += Vector128<byte>.Count)
             {
-
                 Vector128<byte> currentBlock = Vector128.Load(pInputBuffer + processedLength);
                 if (!Ascii.VectorContainsNonAsciiChar(currentBlock))
                 {
@@ -879,30 +880,37 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                     if (prevIncomplete != Vector128<byte>.Zero)
                     {
                         numIncomplete = Vector128<byte>.Count - 3;
-                        goto RewindPointerAndAdjustCounters;
+                        foundError = true;
+                        break;
                     }
 
                     // Often, we have a lot of ASCII characters in a row.
-                    int localasciirun = Vector128<byte>.Count;
-                    if (processedLength + localasciirun + Vector128<byte>.Count <= inputLength)
+                    if (processedLength + (Vector128<byte>.Count * 2) <= inputLength)
                     {
-                        Vector128<byte> block = Vector128.Load(pInputBuffer + processedLength + localasciirun);
+                        byte* pAsciiRunStart   = pInputBuffer + processedLength;
+                        byte* pAsciiRunCurrent = pAsciiRunStart + Vector128<byte>.Count; // The first block is already checked
+
+                        Vector128<byte> block = Vector128.Load(pAsciiRunCurrent);
                         if (!Ascii.VectorContainsNonAsciiChar(block))
                         {
-                            localasciirun += Vector128<byte>.Count;
-                            for (; localasciirun <= inputLength - processedLength - (Vector128<byte>.Count * 4); localasciirun += (Vector128<byte>.Count * 4))
+                            // If we see more ASCII characters, unroll the loop by 4.
+
+                            byte* pAsciiRunEnd = pAsciiRunStart + inputLength - processedLength - (Vector128<byte>.Count * 4);
+
+                            for (; pAsciiRunCurrent <= pAsciiRunEnd; pAsciiRunCurrent += (Vector128<byte>.Count * 4))
                             {
-                                Vector128<byte> block1 = Vector128.Load(pInputBuffer + processedLength + localasciirun);
-                                Vector128<byte> block2 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128<byte>.Count * 1));
-                                Vector128<byte> block3 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128<byte>.Count * 2));
-                                Vector128<byte> block4 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128<byte>.Count * 3));
+                                Vector128<byte> block1 = Vector128.Load(pAsciiRunCurrent);
+                                Vector128<byte> block2 = Vector128.Load(pAsciiRunCurrent + (Vector128<byte>.Count * 1));
+                                Vector128<byte> block3 = Vector128.Load(pAsciiRunCurrent + (Vector128<byte>.Count * 2));
+                                Vector128<byte> block4 = Vector128.Load(pAsciiRunCurrent + (Vector128<byte>.Count * 3));
                                 if (Ascii.VectorContainsNonAsciiChar(block1 | block2 | block3 | block4))
                                 {
                                     break;
                                 }
                             }
                         }
-                        processedLength += localasciirun - Vector128<byte>.Count;
+
+                        processedLength += (int)(pAsciiRunCurrent - pAsciiRunStart) - Vector128<byte>.Count;
                     }
                 }
                 else
@@ -949,7 +957,8 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                     {
                         // Error is found if the error bit mask is non-zero.
                         numIncomplete = (processedLength == 0) ? 0 : 3;
-                        goto RewindPointerAndAdjustCounters;
+                        foundError = true;
+                        break;
                     }
 
                     prevIncomplete = Vector128.SubtractSaturate(currentBlock, maxValue);
@@ -981,38 +990,13 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                     }
                     else
                     {
-                        numContinuationBytes += BitOperations.PopCount(byte2High.ExtractMostSignificantBits());
-                        numFourByteSequences += BitOperations.PopCount(Vector128.SubtractSaturate(currentBlock, fourthByte).ExtractMostSignificantBits());
+                        numContinuationBytes += Vector128.CountWhereAllBitsSet(byte2High);
+                        numFourByteSequences += Vector128.CountWhereAllBitsSet(Vector128.SubtractSaturate(currentBlock, fourthByte));
                     }
                 }
             }
 
-            bool hasIncomplete = prevIncomplete != Vector128<byte>.Zero;
-            if (processedLength == inputLength && !hasIncomplete)
-            {
-                // No invalid byte is found across the whole input length.
-
-                if (AdvSimd.Arm64.IsSupported)
-                {
-                    numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar();
-                    if (vecFourByteSequences != Vector128<sbyte>.Zero)
-                    {
-                        numFourByteSequences -= AdvSimd.Arm64.AddAcrossWidening(vecFourByteSequences).ToScalar();
-                    }
-                }
-                else
-                {
-                    // Do nothing since numContinuationBytes and numFourByteSequences were incremented directly.
-                }
-
-                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes);
-                return pInputBuffer + inputLength;
-            }
-
-            numIncomplete = hasIncomplete ? 3 : 0;
-
-        RewindPointerAndAdjustCounters:
-
+            // Sum up the remaining values from vecContinuationBytes and vecFourByteSequences.
             if (AdvSimd.Arm64.IsSupported)
             {
                 numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar();
@@ -1026,6 +1010,21 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                 // Do nothing since numContinuationBytes and numFourByteSequences were incremented directly.
             }
 
+            if (!foundError)
+            {
+                bool hasIncomplete = prevIncomplete != Vector128<byte>.Zero;
+                if (processedLength == inputLength && !hasIncomplete)
+                {
+                    // No invalid byte is found across the whole input length.
+
+                    (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes);
+                    return pInputBuffer + inputLength;
+                }
+
+                // Still has incomplete data to validate.
+                numIncomplete = hasIncomplete ? 3 : 0;
+            }
+
             // Find the first invalid byte, going back if necessary.
             // Then, adjust the counters 'numFourByteSequences' and 'numContinuationBytes', since we might be
             // overcounting or undercounting them during processing.
@@ -1039,7 +1038,7 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len)
+        private static byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len)
         {
             // We scan the input from buf to len, possibly going back howFarBack bytes, to find the end of
             // a valid UTF-8 sequence. We return buf + len if the buffer is valid, otherwise we return the
@@ -1168,7 +1167,7 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int numFourByteSequences, ref int numContinuationBytes)
+        private static void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int numFourByteSequences, ref int numContinuationBytes)
         {
             if (pInvalid < pProcessed)
             {

From 6aecaaf6e2f459a420cef8b1fbe6f39c3ed21145 Mon Sep 17 00:00:00 2001
From: Yat Long Poon <poon.yat.long@gmail.com>
Date: Thu, 19 Mar 2026 12:21:53 +0000
Subject: [PATCH 5/8] Replace pointers with spans in helpers

---
 .../Text/Unicode/Utf8Utility.Validation.cs    | 127 +++++++++---------
 1 file changed, 63 insertions(+), 64 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index 4427324c5f3f7e..e72eb43c52b572 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -879,7 +879,7 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                     // we need to check if the previous block was incomplete.
                     if (prevIncomplete != Vector128<byte>.Zero)
                     {
-                        numIncomplete = Vector128<byte>.Count - 3;
+                        numIncomplete = 3;
                         foundError = true;
                         break;
                     }
@@ -1029,155 +1029,154 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
             // Then, adjust the counters 'numFourByteSequences' and 'numContinuationBytes', since we might be
             // overcounting or undercounting them during processing.
 
-            byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(numIncomplete, pInputBuffer + processedLength - numIncomplete, inputLength - processedLength + numIncomplete);
+            ReadOnlySpan<byte> inputSpan = new ReadOnlySpan<byte>(pInputBuffer, inputLength);
 
-            AdjustCounters(pInputBuffer + processedLength, invalidBytePointer, ref numFourByteSequences, ref numContinuationBytes);
+            int invalidIndex = SimpleRewindAndValidateWithErrors(numIncomplete, inputSpan, processedLength - numIncomplete);
+            byte* invalidBytePointer = pInputBuffer + invalidIndex;
+
+            AdjustCounters(inputSpan, processedLength, invalidIndex, ref numFourByteSequences, ref numContinuationBytes);
 
             (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes);
             return invalidBytePointer;
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len)
+        private static int SimpleRewindAndValidateWithErrors(int howFarBack, ReadOnlySpan<byte> buffer, int startIndex)
         {
-            // We scan the input from buf to len, possibly going back howFarBack bytes, to find the end of
-            // a valid UTF-8 sequence. We return buf + len if the buffer is valid, otherwise we return the
-            // pointer to the first invalid byte.
+            // Starting at startIndex, we first rewind by up to howFarBack bytes to find a leading
+            // (non-continuation) byte, then scan forward to the end of the buffer. We return
+            // buffer.Length if the buffer is valid, otherwise the index of the first invalid byte.
 
-            int extraLen = 0;
             bool foundLeadingBytes = false;
 
             for (int i = 0; i <= howFarBack; i++)
             {
-                byte candidateByte = buf[0 - i];
+                byte candidateByte = buffer[startIndex - i];
                 foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
 
                 if (foundLeadingBytes)
                 {
-                    buf -= i;
-                    extraLen = i;
+                    startIndex -= i;
                     break;
                 }
             }
 
             if (!foundLeadingBytes)
             {
-                return buf - howFarBack;
+                return startIndex - howFarBack;
             }
-            int pos = 0;
-            int nextPos;
-            uint codePoint = 0;
-
-            len += extraLen;
 
-            while (pos < len)
+            int idx = startIndex;
+            while (idx < buffer.Length)
             {
-                byte firstByte = buf[pos];
+                byte firstByte = buffer[idx];
 
                 while (firstByte < 0b10000000)
                 {
-                    if (++pos == len)
+                    if (++idx == buffer.Length)
                     {
-                        return buf + len;
+                        return buffer.Length;
                     }
-                    firstByte = buf[pos];
+                    firstByte = buffer[idx];
                 }
 
                 if ((firstByte & 0b11100000) == 0b11000000)
                 {
-                    nextPos = pos + 2;
-                    if (nextPos > len)
+                    int nextIdx = idx + 2;
+                    if (nextIdx > buffer.Length)
                     {
-                        return buf + pos;
+                        return idx;
                     } // Too short
-                    if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+                    if ((buffer[idx + 1] & 0b11000000) != 0b10000000)
                     {
-                        return buf + pos;
+                        return idx;
                     } // Too short
                     // range check
-                    codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(buf[pos + 1] & 0b00111111);
+                    uint codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(buffer[idx + 1] & 0b00111111);
                     if ((codePoint < 0x80) || (0x7ff < codePoint))
                     {
-                        return buf + pos;
+                        return idx;
                     } // Overlong
+                    idx = nextIdx;
                 }
                 else if ((firstByte & 0b11110000) == 0b11100000)
                 {
-                    nextPos = pos + 3;
-                    if (nextPos > len)
+                    int nextIdx = idx + 3;
+                    if (nextIdx > buffer.Length)
                     {
-                        return buf + pos;
+                        return idx;
                     } // Too short
                     // range check
-                    codePoint = (uint)(firstByte & 0b00001111) << 12 |
-                                 (uint)(buf[pos + 1] & 0b00111111) << 6 |
-                                 (uint)(buf[pos + 2] & 0b00111111);
+                    uint codePoint = (uint)(firstByte & 0b00001111) << 12 |
+                                     (uint)(buffer[idx + 1] & 0b00111111) << 6 |
+                                     (uint)(buffer[idx + 2] & 0b00111111);
                     // Either overlong or too large:
                     if ((codePoint < 0x800) || (0xffff < codePoint) ||
                         (0xd7ff < codePoint && codePoint < 0xe000))
                     {
-                        return buf + pos;
+                        return idx;
                     }
-                    if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+                    if ((buffer[idx + 1] & 0b11000000) != 0b10000000)
                     {
-                        return buf + pos;
+                        return idx;
                     } // Too short
-                    if ((buf[pos + 2] & 0b11000000) != 0b10000000)
+                    if ((buffer[idx + 2] & 0b11000000) != 0b10000000)
                     {
-                        return buf + pos;
+                        return idx;
                     } // Too short
+                    idx = nextIdx;
                 }
                 else if ((firstByte & 0b11111000) == 0b11110000)
                 {
-                    nextPos = pos + 4;
-                    if (nextPos > len)
+                    int nextIdx = idx + 4;
+                    if (nextIdx > buffer.Length)
                     {
-                        return buf + pos;
+                        return idx;
                     }
-                    if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+                    if ((buffer[idx + 1] & 0b11000000) != 0b10000000)
                     {
-                        return buf + pos;
+                        return idx;
                     }
-                    if ((buf[pos + 2] & 0b11000000) != 0b10000000)
+                    if ((buffer[idx + 2] & 0b11000000) != 0b10000000)
                     {
-                        return buf + pos;
+                        return idx;
                     }
-                    if ((buf[pos + 3] & 0b11000000) != 0b10000000)
+                    if ((buffer[idx + 3] & 0b11000000) != 0b10000000)
                     {
-                        return buf + pos;
+                        return idx;
                     }
                     // range check
-                    codePoint =
-                        (uint)(firstByte & 0b00000111) << 18 | (uint)(buf[pos + 1] & 0b00111111) << 12 |
-                        (uint)(buf[pos + 2] & 0b00111111) << 6 | (uint)(buf[pos + 3] & 0b00111111);
+                    uint codePoint =
+                        (uint)(firstByte & 0b00000111) << 18 | (uint)(buffer[idx + 1] & 0b00111111) << 12 |
+                        (uint)(buffer[idx + 2] & 0b00111111) << 6 | (uint)(buffer[idx + 3] & 0b00111111);
                     if (codePoint <= 0xffff || 0x10ffff < codePoint)
                     {
-                        return buf + pos;
+                        return idx;
                     }
+                    idx = nextIdx;
                 }
                 else
                 {
                     // we may have a continuation/too long error
-                    return buf + pos;
+                    return idx;
                 }
-                pos = nextPos;
             }
 
-            return buf + len; // no error
+            return buffer.Length; // no error
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int numFourByteSequences, ref int numContinuationBytes)
+        private static void AdjustCounters(ReadOnlySpan<byte> buffer, int processedIndex, int invalidIndex, ref int numFourByteSequences, ref int numContinuationBytes)
         {
-            if (pInvalid < pProcessed)
+            if (invalidIndex < processedIndex)
             {
-                for (byte* p = pInvalid; p < pProcessed; p++)
+                for (int i = invalidIndex; i < processedIndex; i++)
                 {
-                    if ((*p & 0b11000000) == 0b10000000)
+                    if ((buffer[i] & 0b11000000) == 0b10000000)
                     {
                         numContinuationBytes -= 1;
                     }
-                    if ((*p & 0b11110000) == 0b11110000)
+                    if ((buffer[i] & 0b11110000) == 0b11110000)
                     {
                         numFourByteSequences -= 1;
                     }
@@ -1185,13 +1184,13 @@ private static void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int num
             }
             else
             {
-                for (byte* p = pProcessed; p < pInvalid; p++)
+                for (int i = processedIndex; i < invalidIndex; i++)
                 {
-                    if ((*p & 0b11000000) == 0b10000000)
+                    if ((buffer[i] & 0b11000000) == 0b10000000)
                     {
                         numContinuationBytes += 1;
                     }
-                    if ((*p & 0b11110000) == 0b11110000)
+                    if ((buffer[i] & 0b11110000) == 0b11110000)
                     {
                         numFourByteSequences += 1;
                     }

From 7f03ab4b2d65492405078efac824336e1f680e78 Mon Sep 17 00:00:00 2001
From: Yat Long Poon <56300571+ylpoonlg@users.noreply.github.com>
Date: Tue, 24 Mar 2026 15:31:36 +0000
Subject: [PATCH 6/8] Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../src/System/Text/Unicode/Utf8Utility.Validation.cs       | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index e72eb43c52b572..cbcb5990945fb8 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -785,7 +785,7 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
 
             Vector128<byte> prevInputBlock = Vector128<byte>.Zero;
 
-            // This is used to detect whether the previous block of contains incomplete sequences.
+            // This is used to detect whether the previous block contains incomplete sequences.
             // It contains the maximum values the previous bytes can be without generating a carry.
             // If we see larger values, it means we need to go back and validate.
             // The first 13 bytes can never generate a carry for a valid UTF-8 byte sequence, the
@@ -946,7 +946,7 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                     Vector128<byte> twoBytesError = byte1High & byte1Low & byte2High;
 
                     // Check if the sequences with two continuation bytes are valid.
-                    // This is only possible for 3 or 4-byte sequences, then we match the expected occurences
+                    // This is only possible for 3 or 4-byte sequences, then we match the expected occurrences
                     // against the MSB from the table lookup results.
                     Vector128<byte> isThirdByte = Vector128.SubtractSaturate(prev2, thirdByte);
                     Vector128<byte> isFourthByte = Vector128.SubtractSaturate(prev3, fourthByte);
@@ -968,7 +968,7 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
 
                     if (AdvSimd.Arm64.IsSupported)
                     {
-                        vecContinuationBytes += Vector128.LessThanOrEqual(Vector128.AsSByte(currentBlock), largestContinuationByte);
+                        vecContinuationBytes += Vector128.LessThanOrEqual(currentBlock.AsSByte(), largestContinuationByte);
                         vecFourByteSequences += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte();
                         overflowCounter++;
                         // We have a risk of overflow if overflowCounter reaches 127,

From 034c620a422fb7dc1ab4f3796dd78924060b5097 Mon Sep 17 00:00:00 2001
From: Yat Long Poon <poon.yat.long@gmail.com>
Date: Mon, 18 May 2026 15:19:01 +0100
Subject: [PATCH 7/8] Fix byte sequence array in tests

---
 .../Unicode/Utf8UtilityTests.ValidateBytes.cs | 39 +++++++++++--------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs
index 0cfbb8afb6b847..1d4f3cfadbbdc3 100644
--- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs
+++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs
@@ -2,7 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System.Buffers;
-using System.Collections;
+using System.Collections.Generic;
 using System.Linq;
 using System.Reflection;
 using System.Runtime.InteropServices;
@@ -22,6 +22,10 @@ public class Utf8UtilityTests
         private const string EURO_SYMBOL = "E282AC"; // U+20AC EURO SIGN, 3 bytes
         private const string GRINNING_FACE = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes
 
+        private static readonly byte[] validTwoByteSequence = Utf8Tests.DecodeHex(E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE);
+        private static readonly byte[] validThreeByteSequence = Utf8Tests.DecodeHex(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL);
+        private static readonly byte[] validFourByteSequence = Utf8Tests.DecodeHex(GRINNING_FACE + GRINNING_FACE + GRINNING_FACE + GRINNING_FACE);
+
         [Theory]
         [InlineData("", 0, 0)] // empty string is OK
         [InlineData(X, 1, 0)]
@@ -289,20 +293,20 @@ private static void AssertIsInvalidTwoByteSequence(byte[] invalidSequence)
 
             // Exercise the vectorized codepath and insert the invalid sequence at different positions.
 
-            byte[] byteVector = Utf8Tests.DecodeHex(E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE);
-
             for (int pos = 0; pos <= 16; pos++)
             {
-                ArrayList testList = new ArrayList(byteVector);
+                List<byte> testList = new List<byte>(validTwoByteSequence);
 
                 if (pos % 2 != 0)
                 {
                     // Replace bytes with valid ASCII characters so they can be broken up.
-                    testList.SetRange(pos - pos % 2, new byte[2] {0x20, 0x21});
+                    int replacementStart = pos - pos % 2;
+                    testList[replacementStart] = 0x20;
+                    testList[replacementStart + 1] = 0x21;
                 }
 
                 testList.InsertRange(pos, invalidSequence);
-                byte[] toTest = (byte[])testList.ToArray(typeof(byte));
+                byte[] toTest = testList.ToArray();
                 GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos + 1) / 2, 0);
             }
         }
@@ -313,20 +317,21 @@ private static void AssertIsInvalidThreeByteSequence(byte[] invalidSequence)
 
             // Exercise the vectorized codepath and insert the invalid sequence at different positions.
 
-            byte[] byteVector = Utf8Tests.DecodeHex(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL);
-
             for (int pos = 0; pos <= 16; pos++)
             {
-                ArrayList testList = new ArrayList(byteVector);
+                List<byte> testList = new List<byte>(validThreeByteSequence);
 
                 if (pos % 3 != 0)
                 {
                     // Replace bytes with valid ASCII characters so they can be broken up.
-                    testList.SetRange(pos - pos % 3, new byte[3] {0x20, 0x21, 0x22});
+                    int replacementStart = pos - pos % 3;
+                    testList[replacementStart] = 0x20;
+                    testList[replacementStart + 1] = 0x21;
+                    testList[replacementStart + 2] = 0x22;
                 }
 
                 testList.InsertRange(pos, invalidSequence);
-                byte[] toTest = (byte[])testList.ToArray(typeof(byte));
+                byte[] toTest = testList.ToArray();
                 GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos / 3) + (pos % 3), 0);
             }
         }
@@ -337,20 +342,22 @@ private static void AssertIsInvalidFourByteSequence(byte[] invalidSequence)
 
             // Exercise the vectorized codepath and insert the invalid sequence at different positions.
 
-            byte[] byteVector = Utf8Tests.DecodeHex(GRINNING_FACE + GRINNING_FACE + GRINNING_FACE + GRINNING_FACE);
-
             for (int pos = 0; pos <= 16; pos++)
             {
-                ArrayList testList = new ArrayList(byteVector);
+                List<byte> testList = new List<byte>(validFourByteSequence);
 
                 if (pos % 4 != 0)
                 {
                     // Replace bytes with valid ASCII characters so they can be broken up.
-                    testList.SetRange(pos - pos % 4, new byte[4] {0x20, 0x21, 0x22, 0x23});
+                    int replacementStart = pos - pos % 4;
+                    testList[replacementStart] = 0x20;
+                    testList[replacementStart + 1] = 0x21;
+                    testList[replacementStart + 2] = 0x22;
+                    testList[replacementStart + 3] = 0x23;
                 }
 
                 testList.InsertRange(pos, invalidSequence);
-                byte[] toTest = (byte[])testList.ToArray(typeof(byte));
+                byte[] toTest = testList.ToArray();
                 GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos / 4) + (pos % 4), pos / 4);
             }
         }

From 04be6a73f3670d2731f31d83a4e14b66beaf3342 Mon Sep 17 00:00:00 2001
From: Yat Long Poon <poon.yat.long@gmail.com>
Date: Mon, 18 May 2026 18:43:28 +0100
Subject: [PATCH 8/8] Comment out dead code

---
 .../src/System/Text/Unicode/Utf8Utility.Validation.cs | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index cbcb5990945fb8..8f96ae2611b6d8 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -963,11 +963,9 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
 
                     prevIncomplete = Vector128.SubtractSaturate(currentBlock, maxValue);
 
-                    // For Arm64, use vecContinuationBytes and vecFourByteSequences to accumulate the sum for better performance.
-                    // Otherwise, increment the adjustments directly on every iteration.
-
                     if (AdvSimd.Arm64.IsSupported)
                     {
+                        // For Arm64, use vecContinuationBytes and vecFourByteSequences to accumulate the sum for better performance.
                         vecContinuationBytes += Vector128.LessThanOrEqual(currentBlock.AsSByte(), largestContinuationByte);
                         vecFourByteSequences += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte();
                         overflowCounter++;
@@ -990,8 +988,11 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
                     }
                     else
                     {
-                        numContinuationBytes += Vector128.CountWhereAllBitsSet(byte2High);
-                        numFourByteSequences += Vector128.CountWhereAllBitsSet(Vector128.SubtractSaturate(currentBlock, fourthByte));
+                        // On other architectures the adjustments would be incremented directly on every
+                        // iteration. TODO: Support other architectures using CountWhereAllBitsSet.
+                        // numContinuationBytes += Vector128.CountWhereAllBitsSet(byte2High);
+                        // numFourByteSequences += Vector128.CountWhereAllBitsSet(Vector128.SubtractSaturate(currentBlock, fourthByte));
+                        throw new PlatformNotSupportedException();
                     }
                 }
             }