From ca8f6ad78ed39e1dd3b601049cc7a90f01324271 Mon Sep 17 00:00:00 2001 From: Yat Long Poon Date: Mon, 3 Nov 2025 10:22:26 +0000 Subject: [PATCH 1/8] Integrate SimdUnicode UTF-8 Validation for AdvSimd --- THIRD-PARTY-NOTICES.TXT | 29 ++ .../src/System/Text/Ascii.Utility.cs | 2 +- .../Text/Unicode/Utf8Utility.Validation.cs | 428 +++++++++++++++++- .../Unicode/Utf8UtilityTests.ValidateBytes.cs | 81 ++-- 4 files changed, 508 insertions(+), 32 deletions(-) diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT index 095be6312166ed..01ef9b39dc7198 100644 --- a/THIRD-PARTY-NOTICES.TXT +++ b/THIRD-PARTY-NOTICES.TXT @@ -720,6 +720,35 @@ License for fastmod (https://github.com/lemire/fastmod), ibm-fpgen (https://gith See the License for the specific language governing permissions and limitations under the License. +License for SimdUnicode (https://github.com/simdutf/SimdUnicode) +-------------------------------------- + + Copyright 2025 Daniel Lemire, Nick Nuon + Which is based on "Validating UTF-8 In Less Than One Instruction Per Byte" article available at https://arxiv.org/abs/2010.03090 + (c) John Keiser, Daniel Lemire + + MIT License + + Copyright (c) 2023 SimdUnicode authors + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + License for sse4-strstr (https://github.com/WojciechMula/sse4-strstr) -------------------------------------- diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index cdec5bb675b312..4982b1d6598ea8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1524,7 +1524,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii #if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) + internal static bool VectorContainsNonAsciiChar(Vector128 asciiVector) { // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index 821037a538b3c8..4dac716564b2f8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -67,6 +67,13 @@ internal static unsafe partial class Utf8Utility byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds #endif +#if NET + if (inputLength >= Vector128.Count && (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) + { + return GetPointerToFirstInvalidByte_Vector128(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); + } +#endif + while 
(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) { // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar. @@ -138,7 +145,6 @@ internal static unsafe partial class Utf8Utility trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask) >> 2; goto LoopTerminatedEarlyDueToNonAsciiData; } - pInputBuffer += 4 * sizeof(uint); // consumed 4 DWORDs } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop); } @@ -756,12 +762,430 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit { throw new PlatformNotSupportedException(); } - Vector128 mostSignificantBitIsSet = (value.AsSByte() >> 7).AsByte(); Vector128 extractedBits = mostSignificantBitIsSet & bitMask128; extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); return extractedBits.AsUInt64().ToScalar(); } + + // The following algorithm based on the SimdUnicode library: + // https://github.com/simdutf/SimdUnicode + // by Daniel Lemire and Nick Nuon + // And the paper "Validating UTF-8 In Less Than One Instruction Per Byte": + // https://arxiv.org/pdf/2010.03090 + // by John Keiser and Daniel Lemire, 2021 + + [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + private static byte* GetPointerToFirstInvalidByte_Vector128(byte* pInputBuffer, int inputLength, + out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) + { + Debug.Assert(inputLength >= Vector128.Count); + + Vector128 prevInputBlock = Vector128.Zero; + + Vector128 maxValue = Vector128.Create( + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); + Vector128 prevIncomplete = Vector128.SubtractSaturate(prevInputBlock, maxValue); + int numIncomplete; // maximum number of incomplete bytes to go back + + const byte TOO_SHORT = 1 << 0; + const byte TOO_LONG = 1 << 1; + const byte OVERLONG_3 = 1 << 2; + const byte SURROGATE = 1 << 4; + const byte OVERLONG_2 = 1 << 5; + const byte TWO_CONTS = 1 << 7; + const 
byte TOO_LARGE = 1 << 3; + const byte TOO_LARGE_1000 = 1 << 6; + const byte OVERLONG_4 = 1 << 6; + const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; + + Vector128 shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + TOO_SHORT | OVERLONG_2, + TOO_SHORT, + TOO_SHORT | OVERLONG_3 | SURROGATE, + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + + Vector128 shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + CARRY | OVERLONG_2, + CARRY, + CARRY, + CARRY | TOO_LARGE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + + Vector128 shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + + Vector128 thirdByte = Vector128.Create((byte)(0b11100000u - 0x80)); + Vector128 fourthByte = Vector128.Create((byte)(0b11110000u - 0x80)); + Vector128 v0f = Vector128.Create((byte)0x0F); + Vector128 v80 = Vector128.Create((byte)0x80); + Vector128 fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1)); + Vector128 largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111 + + int contbytes = 0; // number of continuation bytes in the block + int n4 = 0; // number of 4-byte sequences that start in this block + 
+ // For Arm64: + // Instead of updating n4 and contbytes continuously, we accumulate + // the values in n4v and contv, while using overflowCounter to make + // sure we do not overflow. This allows you to reach good performance + // on systems where summing across vectors is slow. + Vector128 n4v = Vector128.Zero; + Vector128 contv = Vector128.Zero; + int overflowCounter = 0; + + int processedLength = 0; + for (; processedLength <= inputLength - Vector128.Count; processedLength += Vector128.Count) + { + + Vector128 currentBlock = Vector128.Load(pInputBuffer + processedLength); + if (!Ascii.VectorContainsNonAsciiChar(currentBlock)) + { + // We have an ASCII block, no need to process it, but + // we need to check if the previous block was incomplete. + if (prevIncomplete != Vector128.Zero) + { + numIncomplete = Vector128.Count - 3; + goto RewindPointerAndAdjustCounters; + } + + // Often, we have a lot of ASCII characters in a row. + int localasciirun = Vector128.Count; + if (processedLength + localasciirun + Vector128.Count <= inputLength) + { + Vector128 block = Vector128.Load(pInputBuffer + processedLength + localasciirun); + if (!Ascii.VectorContainsNonAsciiChar(block)) + { + localasciirun += Vector128.Count; + for (; localasciirun <= inputLength - processedLength - (Vector128.Count * 4); localasciirun += (Vector128.Count * 4)) + { + Vector128 block1 = Vector128.Load(pInputBuffer + processedLength + localasciirun); + Vector128 block2 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128.Count * 1)); + Vector128 block3 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128.Count * 2)); + Vector128 block4 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128.Count * 3)); + if (Ascii.VectorContainsNonAsciiChar(block1 | block2 | block3 | block4)) + { + break; + } + } + } + processedLength += localasciirun - Vector128.Count; + } + } + else + { + // Contains non-ASCII characters, we need to do 
non-trivial processing + + Vector128 prev1; + Vector128 prev2; + Vector128 prev3; + Vector128 byte1High; + Vector128 byte1Low; + Vector128 byte2High; + + // TODO: Support for other architectures can be added here. + + if (AdvSimd.Arm64.IsSupported) + { + prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 1)); + byte1High = AdvSimd.Arm64.VectorTableLookup(shuf1, prev1 >>> 4); + byte1Low = AdvSimd.Arm64.VectorTableLookup(shuf2, (prev1 & v0f)); + byte2High = AdvSimd.Arm64.VectorTableLookup(shuf3, currentBlock >>> 4); + prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2)); + prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3)); + } + else + { + throw new PlatformNotSupportedException(); + } + + prevInputBlock = currentBlock; + Vector128 twoBytesError = byte1High & byte1Low & byte2High; + Vector128 isThirdByte = Vector128.SubtractSaturate(prev2, thirdByte); + Vector128 isFourthByte = Vector128.SubtractSaturate(prev3, fourthByte); + Vector128 must23 = isThirdByte | isFourthByte; + Vector128 must23As80 = must23 & v80; + Vector128 error = must23As80 ^ twoBytesError; + if (error != Vector128.Zero) + { + numIncomplete = (processedLength == 0) ? 0 : 3; + goto RewindPointerAndAdjustCounters; + } + prevIncomplete = Vector128.SubtractSaturate(currentBlock, maxValue); + + // For Arm64, use contv and n4v to accumulate the sum for better performance. + // Otherwise, increment the adjustments directly on every iteration. + + if (AdvSimd.Arm64.IsSupported) + { + contv += Vector128.LessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont); + n4v += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte(); + overflowCounter++; + // We have a risk of overflow if overflowCounter reaches 255, + // in which case, we empty contv and n4v, and update contbytes and + // n4. 
+ if (overflowCounter == 0xff) + { + overflowCounter = 0; + contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar(); + contv = Vector128.Zero; + if (n4v != Vector128.Zero) + { + n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar(); + n4v = Vector128.Zero; + } + } + } + else + { + contbytes += BitOperations.PopCount(byte2High.ExtractMostSignificantBits()); + n4 += BitOperations.PopCount(Vector128.SubtractSaturate(currentBlock, fourthByte).ExtractMostSignificantBits()); + } + } + } + + bool hasIncomplete = prevIncomplete != Vector128.Zero; + if (processedLength == inputLength && !hasIncomplete) + { + // No invalid byte is found across the whole input length. + + if (AdvSimd.Arm64.IsSupported) + { + contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar(); + if (n4v != Vector128.Zero) + { + n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar(); + } + } + else + { + // Do nothing since contbytes and n4 were incremented directly. + } + + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes); + return pInputBuffer + inputLength; + } + + numIncomplete = hasIncomplete ? 3 : 0; + + RewindPointerAndAdjustCounters: + + if (AdvSimd.Arm64.IsSupported) + { + contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar(); + if (n4v != Vector128.Zero) + { + n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar(); + } + } + else + { + // Do nothing since contbytes and n4 were incremented directly. + } + + // Find the first invalid byte, going back if necessary. + // Then, adjust the counters 'n4' and 'contbytes', since we might be + // overcounting or undercounting them during processing. 
+ + byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(numIncomplete, pInputBuffer + processedLength - numIncomplete, inputLength - processedLength + numIncomplete); + + AdjustCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes); + + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes); + return invalidBytePointer; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len) + { + // We scan the input from buf to len, possibly going back howFarBack bytes, to find the end of + // a valid UTF-8 sequence. We return buf + len if the buffer is valid, otherwise we return the + // pointer to the first invalid byte. + + int extraLen = 0; + bool foundLeadingBytes = false; + + for (int i = 0; i <= howFarBack; i++) + { + byte candidateByte = buf[0 - i]; + foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + + if (foundLeadingBytes) + { + buf -= i; + extraLen = i; + break; + } + } + + if (!foundLeadingBytes) + { + return buf - howFarBack; + } + int pos = 0; + int nextPos; + uint codePoint = 0; + + len += extraLen; + + while (pos < len) + { + + byte firstByte = buf[pos]; + + while (firstByte < 0b10000000) + { + if (++pos == len) + { + return buf + len; + } + firstByte = buf[pos]; + } + + if ((firstByte & 0b11100000) == 0b11000000) + { + nextPos = pos + 2; + if (nextPos > len) + { + return buf + pos; + } // Too short + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + { + return buf + pos; + } // Too short + // range check + codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(buf[pos + 1] & 0b00111111); + if ((codePoint < 0x80) || (0x7ff < codePoint)) + { + return buf + pos; + } // Overlong + } + else if ((firstByte & 0b11110000) == 0b11100000) + { + nextPos = pos + 3; + if (nextPos > len) + { + return buf + pos; + } // Too short + // range check + codePoint = 
(uint)(firstByte & 0b00001111) << 12 | + (uint)(buf[pos + 1] & 0b00111111) << 6 | + (uint)(buf[pos + 2] & 0b00111111); + // Either overlong or too large: + if ((codePoint < 0x800) || (0xffff < codePoint) || + (0xd7ff < codePoint && codePoint < 0xe000)) + { + return buf + pos; + } + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + { + return buf + pos; + } // Too short + if ((buf[pos + 2] & 0b11000000) != 0b10000000) + { + return buf + pos; + } // Too short + } + else if ((firstByte & 0b11111000) == 0b11110000) + { + nextPos = pos + 4; + if (nextPos > len) + { + return buf + pos; + } + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + { + return buf + pos; + } + if ((buf[pos + 2] & 0b11000000) != 0b10000000) + { + return buf + pos; + } + if ((buf[pos + 3] & 0b11000000) != 0b10000000) + { + return buf + pos; + } + // range check + codePoint = + (uint)(firstByte & 0b00000111) << 18 | (uint)(buf[pos + 1] & 0b00111111) << 12 | + (uint)(buf[pos + 2] & 0b00111111) << 6 | (uint)(buf[pos + 3] & 0b00111111); + if (codePoint <= 0xffff || 0x10ffff < codePoint) + { + return buf + pos; + } + } + else + { + // we may have a continuation/too long error + return buf + pos; + } + pos = nextPos; + } + + return buf + len; // no error + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int n4, ref int contbytes) + { + if (pInvalid < pProcessed) + { + for (byte* p = pInvalid; p < pProcessed; p++) + { + if ((*p & 0b11000000) == 0b10000000) + { + contbytes -= 1; + } + if ((*p & 0b11110000) == 0b11110000) + { + n4 -= 1; + } + } + } + else + { + for (byte* p = pProcessed; p < pInvalid; p++) + { + if ((*p & 0b11000000) == 0b10000000) + { + contbytes += 1; + } + if ((*p & 0b11110000) == 0b11110000) + { + n4 += 1; + } + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int n4, int contbytes) + { + int n3 = -2 
* n4 + 2 * contbytes; + int n2 = n4 - 3 * contbytes; + int utfadjust = -2 * n4 - 2 * n3 - n2; + int scalaradjust = -n4; + + return (utfadjust, scalaradjust); + } #endif } } diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs index 4730337b0878ed..0cfbb8afb6b847 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Buffers; +using System.Collections; using System.Linq; using System.Reflection; using System.Runtime.InteropServices; @@ -241,6 +242,13 @@ public void GetIndexOfFirstInvalidUtf8Sequence_WithOutOfRangeFourByteSequences_R { AssertIsInvalidFourByteSequence(new byte[] { 0xF4, (byte)i, 0x80, 0x80 }); } + + // [ F5..FF ] [ 80..BF ] [ 80..BF ] [ 80..BF ] is out-of-range 4-byte sequence + + for (int i = 0xF5; i < 0x100; i++) + { + AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x80, 0x80, 0x80 }); + } } [Fact] @@ -279,57 +287,72 @@ private static void AssertIsInvalidTwoByteSequence(byte[] invalidSequence) { Assert.Equal(2, invalidSequence.Length); - byte[] knownGoodBytes = Utf8Tests.DecodeHex(E_ACUTE); - - byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0); + // Exercise the vectorized codepath and insert the invalid sequence at different positions. 
- toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of first DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 2, 1, 0); + byte[] byteVector = Utf8Tests.DecodeHex(E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE); - // Run the same tests but with extra data at the beginning so that we're inside one of - // the 2-byte processing "hot loop" code paths. + for (int pos = 0; pos <= 16; pos++) + { + ArrayList testList = new ArrayList(byteVector); - toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of next DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 2, 0); + if (pos % 2 != 0) + { + // Replace bytes with valid ASCII characters so they can be broken up. + testList.SetRange(pos - pos % 2, new byte[2] {0x20, 0x21}); + } - toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of next DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 3, 0); + testList.InsertRange(pos, invalidSequence); + byte[] toTest = (byte[])testList.ToArray(typeof(byte)); + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos + 1) / 2, 0); + } } private static void AssertIsInvalidThreeByteSequence(byte[] invalidSequence) { Assert.Equal(3, invalidSequence.Length); - byte[] knownGoodBytes = Utf8Tests.DecodeHex(EURO_SYMBOL); - - byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0); + // Exercise the vectorized codepath and insert the invalid sequence at different positions. - // Run the same tests but with extra data at the beginning so that we're inside one of - // the 3-byte processing "hot loop" code paths. 
+ byte[] byteVector = Utf8Tests.DecodeHex(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL); - toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling first and second DWORDs - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 3, 1, 0); + for (int pos = 0; pos <= 16; pos++) + { + ArrayList testList = new ArrayList(byteVector); - toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling second and third DWORDs - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 2, 0); + if (pos % 3 != 0) + { + // Replace bytes with valid ASCII characters so they can be broken up. + testList.SetRange(pos - pos % 3, new byte[3] {0x20, 0x21, 0x22}); + } - toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of third DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 9, 3, 0); + testList.InsertRange(pos, invalidSequence); + byte[] toTest = (byte[])testList.ToArray(typeof(byte)); + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos / 3) + (pos % 3), 0); + } } private static void AssertIsInvalidFourByteSequence(byte[] invalidSequence) { Assert.Equal(4, invalidSequence.Length); - byte[] knownGoodBytes = Utf8Tests.DecodeHex(GRINNING_FACE); + // Exercise the vectorized codepath and insert the invalid sequence at different positions. 
+ + byte[] byteVector = Utf8Tests.DecodeHex(GRINNING_FACE + GRINNING_FACE + GRINNING_FACE + GRINNING_FACE); - byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0); + for (int pos = 0; pos <= 16; pos++) + { + ArrayList testList = new ArrayList(byteVector); - toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 1, 1); + if (pos % 4 != 0) + { + // Replace bytes with valid ASCII characters so they can be broken up. + testList.SetRange(pos - pos % 4, new byte[4] {0x20, 0x21, 0x22, 0x23}); + } + + testList.InsertRange(pos, invalidSequence); + byte[] toTest = (byte[])testList.ToArray(typeof(byte)); + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos / 4) + (pos % 4), pos / 4); + } } private static void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(string inputHex, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount) From a711ed03cb44e89e998237fa4f6ce860346be3aa Mon Sep 17 00:00:00 2001 From: Yat Long Poon Date: Thu, 18 Dec 2025 10:12:08 +0000 Subject: [PATCH 2/8] Fix overflow counter contv and n4v are Vector128 so the largest positive value is 127. 
--- .../src/System/Text/Unicode/Utf8Utility.Validation.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index 4dac716564b2f8..2f5d21efffad31 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -939,10 +939,10 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit contv += Vector128.LessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont); n4v += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte(); overflowCounter++; - // We have a risk of overflow if overflowCounter reaches 255, + // We have a risk of overflow if overflowCounter reaches 127, // in which case, we empty contv and n4v, and update contbytes and // n4. 
- if (overflowCounter == 0xff) + if (overflowCounter == 0x7f) { overflowCounter = 0; contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar(); From b594b1579c88417494e12a30a45ca1bb05251963 Mon Sep 17 00:00:00 2001 From: Yat Long Poon Date: Wed, 18 Mar 2026 13:31:46 +0000 Subject: [PATCH 3/8] Add comments and fix variable names --- .../Text/Unicode/Utf8Utility.Validation.cs | 152 ++++++++++-------- 1 file changed, 88 insertions(+), 64 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index 2f5d21efffad31..ff36204d87ca08 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -145,6 +145,7 @@ internal static unsafe partial class Utf8Utility trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask) >> 2; goto LoopTerminatedEarlyDueToNonAsciiData; } + pInputBuffer += 4 * sizeof(uint); // consumed 4 DWORDs } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop); } @@ -762,6 +763,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit { throw new PlatformNotSupportedException(); } + Vector128 mostSignificantBitIsSet = (value.AsSByte() >> 7).AsByte(); Vector128 extractedBits = mostSignificantBitIsSet & bitMask128; extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); @@ -783,24 +785,37 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit Vector128 prevInputBlock = Vector128.Zero; + // This is used to detect whether the previous block contains incomplete sequences. + // It contains the maximum values the previous bytes can be without generating a carry. + // If we see larger values, it means we need to go back and validate. 
+ // The first 13 bytes can never generate a carry for a valid UTF-8 byte sequence, the + // last 3 bytes are the maximum starting byte of a 3-byte, 2-byte and 1-byte sequence + // respectively. Vector128 maxValue = Vector128.Create( 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); Vector128 prevIncomplete = Vector128.SubtractSaturate(prevInputBlock, maxValue); int numIncomplete; // maximum number of incomplete bytes to go back - const byte TOO_SHORT = 1 << 0; - const byte TOO_LONG = 1 << 1; - const byte OVERLONG_3 = 1 << 2; - const byte SURROGATE = 1 << 4; - const byte OVERLONG_2 = 1 << 5; - const byte TWO_CONTS = 1 << 7; - const byte TOO_LARGE = 1 << 3; - const byte TOO_LARGE_1000 = 1 << 6; - const byte OVERLONG_4 = 1 << 6; - const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; - - Vector128 shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // The error bit encoding is slightly different from the paper, instead it follows the + // SimdUnicode implementation. TOO_LARGE_1000 and OVERLONG_4 can share the same bit as + // their conditions are mutually exclusive. 
+ const byte TOO_SHORT = 1 << 0; // Sequence is missing continuation bytes + const byte TOO_LONG = 1 << 1; // ASCII byte is followed by a continuation byte + const byte OVERLONG_3 = 1 << 2; // Character is out-of-range for a 3-byte sequence + const byte SURROGATE = 1 << 4; // Character range is reserved for UTF-16 surrogates + const byte OVERLONG_2 = 1 << 5; // Character is out-of-range for a 2-byte sequence + const byte TWO_CONTS = 1 << 7; // (Not an error) Two continuation bytes + const byte TOO_LARGE = 1 << 3; // Character is larger than the largest Unicode character + const byte TOO_LARGE_1000 = 1 << 6; // Same as TOO_LARGE, but the 2nd byte starts with 0b1000 + const byte OVERLONG_4 = 1 << 6; // Character is out-of-range for a 4-byte sequence + const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // A common case for continuation bytes + + // The 3 lookup tables used to map nibbles of consecutive bytes to possible errors in each case. + // A 4-bit nibble from the upper or lower half of a byte is used as an index (0-15) to look up the + // corresponding error mask from the 128-bit vector. 
+ + Vector128 tableByte1High = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, TOO_SHORT | OVERLONG_2, @@ -808,7 +823,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit TOO_SHORT | OVERLONG_3 | SURROGATE, TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); - Vector128 shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + Vector128 tableByte1Low = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, CARRY | OVERLONG_2, CARRY, CARRY, @@ -825,7 +840,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000); - Vector128 shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + Vector128 tableByte2High = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, @@ -835,21 +850,21 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit Vector128 thirdByte = Vector128.Create((byte)(0b11100000u - 0x80)); Vector128 fourthByte = Vector128.Create((byte)(0b11110000u - 0x80)); - Vector128 v0f = Vector128.Create((byte)0x0F); - Vector128 v80 = Vector128.Create((byte)0x80); + Vector128 vec0F = Vector128.Create((byte)0x0F); + Vector128 vec80 = Vector128.Create((byte)0x80); Vector128 fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1)); - Vector128 largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111 + Vector128 largestContinuationByte = Vector128.Create((sbyte)-65); // -65 => 0b10111111 - int contbytes = 0; // number of continuation bytes in the block - int n4 = 0; // number of 4-byte sequences that start in this block + int numContinuationBytes = 0; // number of continuation bytes in the block + int 
numFourByteSequences = 0; // number of 4-byte sequences that start in this block // For Arm64: - // Instead of updating n4 and contbytes continuously, we accumulate - // the values in n4v and contv, while using overflowCounter to make + // Instead of updating numFourByteSequences and numContinuationBytes continuously, we accumulate + // the values in vecFourByteSequences and vecContinuationBytes, while using overflowCounter to make // sure we do not overflow. This allows you to reach good performance // on systems where summing across vectors is slow. - Vector128 n4v = Vector128.Zero; - Vector128 contv = Vector128.Zero; + Vector128 vecFourByteSequences = Vector128.Zero; + Vector128 vecContinuationBytes = Vector128.Zero; int overflowCounter = 0; int processedLength = 0; @@ -906,9 +921,9 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit if (AdvSimd.Arm64.IsSupported) { prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 1)); - byte1High = AdvSimd.Arm64.VectorTableLookup(shuf1, prev1 >>> 4); - byte1Low = AdvSimd.Arm64.VectorTableLookup(shuf2, (prev1 & v0f)); - byte2High = AdvSimd.Arm64.VectorTableLookup(shuf3, currentBlock >>> 4); + byte1High = AdvSimd.Arm64.VectorTableLookup(tableByte1High, prev1 >>> 4); + byte1Low = AdvSimd.Arm64.VectorTableLookup(tableByte1Low, (prev1 & vec0F)); + byte2High = AdvSimd.Arm64.VectorTableLookup(tableByte2High, currentBlock >>> 4); prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2)); prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3)); } @@ -918,46 +933,56 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit } prevInputBlock = currentBlock; + + // Find invalid 2-byte sequences by matching the error bits from the table lookups. Vector128 twoBytesError = byte1High & byte1Low & byte2High; + + // Check if the sequences with two continuation bytes are valid. 
+ // This is only possible for 3 or 4-byte sequences, then we match the expected occurences + // against the MSB from the table lookup results. Vector128 isThirdByte = Vector128.SubtractSaturate(prev2, thirdByte); Vector128 isFourthByte = Vector128.SubtractSaturate(prev3, fourthByte); - Vector128 must23 = isThirdByte | isFourthByte; - Vector128 must23As80 = must23 & v80; - Vector128 error = must23As80 ^ twoBytesError; + Vector128 twoContinuationBytes = (isThirdByte | isFourthByte) & vec80; // Extract the MSB + Vector128 error = twoContinuationBytes ^ twoBytesError; + if (error != Vector128.Zero) { + // Error is found if the error bit mask is non-zero. numIncomplete = (processedLength == 0) ? 0 : 3; goto RewindPointerAndAdjustCounters; } + prevIncomplete = Vector128.SubtractSaturate(currentBlock, maxValue); - // For Arm64, use contv and n4v to accumulate the sum for better performance. + // For Arm64, use vecContinuationBytes and vecFourByteSequences to accumulate the sum for better performance. // Otherwise, increment the adjustments directly on every iteration. if (AdvSimd.Arm64.IsSupported) { - contv += Vector128.LessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont); - n4v += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte(); + vecContinuationBytes += Vector128.LessThanOrEqual(Vector128.AsSByte(currentBlock), largestContinuationByte); + vecFourByteSequences += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte(); overflowCounter++; // We have a risk of overflow if overflowCounter reaches 127, - // in which case, we empty contv and n4v, and update contbytes and - // n4. + // in which case, we empty vecContinuationBytes and vecFourByteSequences, and update numContinuationBytes and + // numFourByteSequences. 
if (overflowCounter == 0x7f) { overflowCounter = 0; - contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar(); - contv = Vector128.Zero; - if (n4v != Vector128.Zero) + + // The vector results are negative, so subtract to make the scalar positive. + numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar(); + vecContinuationBytes = Vector128.Zero; + if (vecFourByteSequences != Vector128.Zero) { - n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar(); - n4v = Vector128.Zero; + numFourByteSequences -= AdvSimd.Arm64.AddAcrossWidening(vecFourByteSequences).ToScalar(); + vecFourByteSequences = Vector128.Zero; } } } else { - contbytes += BitOperations.PopCount(byte2High.ExtractMostSignificantBits()); - n4 += BitOperations.PopCount(Vector128.SubtractSaturate(currentBlock, fourthByte).ExtractMostSignificantBits()); + numContinuationBytes += BitOperations.PopCount(byte2High.ExtractMostSignificantBits()); + numFourByteSequences += BitOperations.PopCount(Vector128.SubtractSaturate(currentBlock, fourthByte).ExtractMostSignificantBits()); } } } @@ -969,18 +994,18 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit if (AdvSimd.Arm64.IsSupported) { - contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar(); - if (n4v != Vector128.Zero) + numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar(); + if (vecFourByteSequences != Vector128.Zero) { - n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar(); + numFourByteSequences -= AdvSimd.Arm64.AddAcrossWidening(vecFourByteSequences).ToScalar(); } } else { - // Do nothing since contbytes and n4 were incremented directly. + // Do nothing since numContinuationBytes and numFourByteSequences were incremented directly. 
} - (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes); + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes); return pInputBuffer + inputLength; } @@ -990,26 +1015,26 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit if (AdvSimd.Arm64.IsSupported) { - contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar(); - if (n4v != Vector128.Zero) + numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar(); + if (vecFourByteSequences != Vector128.Zero) { - n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar(); + numFourByteSequences -= AdvSimd.Arm64.AddAcrossWidening(vecFourByteSequences).ToScalar(); } } else { - // Do nothing since contbytes and n4 were incremented directly. + // Do nothing since numContinuationBytes and numFourByteSequences were incremented directly. } // Find the first invalid byte, going back if necessary. - // Then, adjust the counters 'n4' and 'contbytes', since we might be + // Then, adjust the counters 'numFourByteSequences' and 'numContinuationBytes', since we might be // overcounting or undercounting them during processing. 
byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(numIncomplete, pInputBuffer + processedLength - numIncomplete, inputLength - processedLength + numIncomplete); - AdjustCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes); + AdjustCounters(pInputBuffer + processedLength, invalidBytePointer, ref numFourByteSequences, ref numContinuationBytes); - (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes); + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes); return invalidBytePointer; } @@ -1048,7 +1073,6 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit while (pos < len) { - byte firstByte = buf[pos]; while (firstByte < 0b10000000) @@ -1144,7 +1168,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int n4, ref int contbytes) + private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int numFourByteSequences, ref int numContinuationBytes) { if (pInvalid < pProcessed) { @@ -1152,11 +1176,11 @@ private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref { if ((*p & 0b11000000) == 0b10000000) { - contbytes -= 1; + numContinuationBytes -= 1; } if ((*p & 0b11110000) == 0b11110000) { - n4 -= 1; + numFourByteSequences -= 1; } } } @@ -1166,23 +1190,23 @@ private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref { if ((*p & 0b11000000) == 0b10000000) { - contbytes += 1; + numContinuationBytes += 1; } if ((*p & 0b11110000) == 0b11110000) { - n4 += 1; + numFourByteSequences += 1; } } } } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int n4, int contbytes) + private static (int 
utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int numFourByteSequences, int numContinuationBytes) { - int n3 = -2 * n4 + 2 * contbytes; - int n2 = n4 - 3 * contbytes; - int utfadjust = -2 * n4 - 2 * n3 - n2; - int scalaradjust = -n4; + int n3 = -2 * numFourByteSequences + 2 * numContinuationBytes; + int n2 = numFourByteSequences - 3 * numContinuationBytes; + int utfadjust = -2 * numFourByteSequences - 2 * n3 - n2; + int scalaradjust = -numFourByteSequences; return (utfadjust, scalaradjust); } From 924598d52f06961326657034e59a55b091d8e3fa Mon Sep 17 00:00:00 2001 From: Yat Long Poon Date: Wed, 18 Mar 2026 15:54:56 +0000 Subject: [PATCH 4/8] Address code review suggestions --- .../Text/Unicode/Utf8Utility.Validation.cs | 87 +++++++++---------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index ff36204d87ca08..4427324c5f3f7e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -795,7 +795,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); Vector128 prevIncomplete = Vector128.SubtractSaturate(prevInputBlock, maxValue); - int numIncomplete; // maximum number of incomplete bytes to go back + int numIncomplete = 0; // maximum number of incomplete bytes to go back // The error bit encoding is slightly different from the paper, instead it follows the // SimdUnicode implementation. 
TOO_LARGE_1000 and OVERLONG_4 can share the same bit as @@ -867,10 +867,11 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit Vector128 vecContinuationBytes = Vector128.Zero; int overflowCounter = 0; + bool foundError = false; + int processedLength = 0; for (; processedLength <= inputLength - Vector128.Count; processedLength += Vector128.Count) { - Vector128 currentBlock = Vector128.Load(pInputBuffer + processedLength); if (!Ascii.VectorContainsNonAsciiChar(currentBlock)) { @@ -879,30 +880,37 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit if (prevIncomplete != Vector128.Zero) { numIncomplete = Vector128.Count - 3; - goto RewindPointerAndAdjustCounters; + foundError = true; + break; } // Often, we have a lot of ASCII characters in a row. - int localasciirun = Vector128.Count; - if (processedLength + localasciirun + Vector128.Count <= inputLength) + if (processedLength + (Vector128.Count * 2) <= inputLength) { - Vector128 block = Vector128.Load(pInputBuffer + processedLength + localasciirun); + byte* pAsciiRunStart = pInputBuffer + processedLength; + byte* pAsciiRunCurrent = pAsciiRunStart + Vector128.Count; // The first block is already checked + + Vector128 block = Vector128.Load(pAsciiRunCurrent); if (!Ascii.VectorContainsNonAsciiChar(block)) { - localasciirun += Vector128.Count; - for (; localasciirun <= inputLength - processedLength - (Vector128.Count * 4); localasciirun += (Vector128.Count * 4)) + // If we see more ASCII characters, unroll the loop by 4. 
+ + byte* pAsciiRunEnd = pAsciiRunStart + inputLength - processedLength - (Vector128.Count * 4); + + for (; pAsciiRunCurrent <= pAsciiRunEnd; pAsciiRunCurrent += (Vector128.Count * 4)) { - Vector128 block1 = Vector128.Load(pInputBuffer + processedLength + localasciirun); - Vector128 block2 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128.Count * 1)); - Vector128 block3 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128.Count * 2)); - Vector128 block4 = Vector128.Load(pInputBuffer + processedLength + localasciirun + (Vector128.Count * 3)); + Vector128 block1 = Vector128.Load(pAsciiRunCurrent); + Vector128 block2 = Vector128.Load(pAsciiRunCurrent + (Vector128.Count * 1)); + Vector128 block3 = Vector128.Load(pAsciiRunCurrent + (Vector128.Count * 2)); + Vector128 block4 = Vector128.Load(pAsciiRunCurrent + (Vector128.Count * 3)); if (Ascii.VectorContainsNonAsciiChar(block1 | block2 | block3 | block4)) { break; } } } - processedLength += localasciirun - Vector128.Count; + + processedLength += (int)(pAsciiRunCurrent - pAsciiRunStart) - Vector128.Count; } } else @@ -949,7 +957,8 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit { // Error is found if the error bit mask is non-zero. numIncomplete = (processedLength == 0) ? 
0 : 3; - goto RewindPointerAndAdjustCounters; + foundError = true; + break; } prevIncomplete = Vector128.SubtractSaturate(currentBlock, maxValue); @@ -981,38 +990,13 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit } else { - numContinuationBytes += BitOperations.PopCount(byte2High.ExtractMostSignificantBits()); - numFourByteSequences += BitOperations.PopCount(Vector128.SubtractSaturate(currentBlock, fourthByte).ExtractMostSignificantBits()); + numContinuationBytes += Vector128.CountWhereAllBitsSet(byte2High); + numFourByteSequences += Vector128.CountWhereAllBitsSet(Vector128.SubtractSaturate(currentBlock, fourthByte)); } } } - bool hasIncomplete = prevIncomplete != Vector128.Zero; - if (processedLength == inputLength && !hasIncomplete) - { - // No invalid byte is found across the whole input length. - - if (AdvSimd.Arm64.IsSupported) - { - numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar(); - if (vecFourByteSequences != Vector128.Zero) - { - numFourByteSequences -= AdvSimd.Arm64.AddAcrossWidening(vecFourByteSequences).ToScalar(); - } - } - else - { - // Do nothing since numContinuationBytes and numFourByteSequences were incremented directly. - } - - (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes); - return pInputBuffer + inputLength; - } - - numIncomplete = hasIncomplete ? 3 : 0; - - RewindPointerAndAdjustCounters: - + // Sum up the remaining values from vecContinuationBytes and vecFourByteSequences. if (AdvSimd.Arm64.IsSupported) { numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar(); @@ -1026,6 +1010,21 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit // Do nothing since numContinuationBytes and numFourByteSequences were incremented directly. 
} + if (!foundError) + { + bool hasIncomplete = prevIncomplete != Vector128.Zero; + if (processedLength == inputLength && !hasIncomplete) + { + // No invalid byte is found across the whole input length. + + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes); + return pInputBuffer + inputLength; + } + + // Still has incomplete data to validate. + numIncomplete = hasIncomplete ? 3 : 0; + } + // Find the first invalid byte, going back if necessary. // Then, adjust the counters 'numFourByteSequences' and 'numContinuationBytes', since we might be // overcounting or undercounting them during processing. @@ -1039,7 +1038,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len) + private static byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len) { // We scan the input from buf to len, possibly going back howFarBack bytes, to find the end of // a valid UTF-8 sequence. 
We return buf + len if the buffer is valid, otherwise we return the @@ -1168,7 +1167,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int numFourByteSequences, ref int numContinuationBytes) + private static void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int numFourByteSequences, ref int numContinuationBytes) { if (pInvalid < pProcessed) { From 6aecaaf6e2f459a420cef8b1fbe6f39c3ed21145 Mon Sep 17 00:00:00 2001 From: Yat Long Poon Date: Thu, 19 Mar 2026 12:21:53 +0000 Subject: [PATCH 5/8] Replace pointers with spans in helpers --- .../Text/Unicode/Utf8Utility.Validation.cs | 127 +++++++++--------- 1 file changed, 63 insertions(+), 64 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index 4427324c5f3f7e..e72eb43c52b572 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -879,7 +879,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit // we need to check if the previous block was incomplete. if (prevIncomplete != Vector128.Zero) { - numIncomplete = Vector128.Count - 3; + numIncomplete = 3; foundError = true; break; } @@ -1029,155 +1029,154 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit // Then, adjust the counters 'numFourByteSequences' and 'numContinuationBytes', since we might be // overcounting or undercounting them during processing. 
- byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(numIncomplete, pInputBuffer + processedLength - numIncomplete, inputLength - processedLength + numIncomplete); + ReadOnlySpan inputSpan = new ReadOnlySpan(pInputBuffer, inputLength); - AdjustCounters(pInputBuffer + processedLength, invalidBytePointer, ref numFourByteSequences, ref numContinuationBytes); + int invalidIndex = SimpleRewindAndValidateWithErrors(numIncomplete, inputSpan, processedLength - numIncomplete); + byte* invalidBytePointer = pInputBuffer + invalidIndex; + + AdjustCounters(inputSpan, processedLength, invalidIndex, ref numFourByteSequences, ref numContinuationBytes); (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes); return invalidBytePointer; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len) + private static int SimpleRewindAndValidateWithErrors(int howFarBack, ReadOnlySpan buffer, int startIndex) { - // We scan the input from buf to len, possibly going back howFarBack bytes, to find the end of - // a valid UTF-8 sequence. We return buf + len if the buffer is valid, otherwise we return the - // pointer to the first invalid byte. + // We scan from startIndex forward, possibly going back howFarBack bytes, to find the end of + // a valid UTF-8 sequence. 
We return buffer.Length if the buffer is valid, otherwise we return + // the index of the first invalid byte - int extraLen = 0; bool foundLeadingBytes = false; for (int i = 0; i <= howFarBack; i++) { - byte candidateByte = buf[0 - i]; + byte candidateByte = buffer[startIndex - i]; foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; if (foundLeadingBytes) { - buf -= i; - extraLen = i; + startIndex -= i; break; } } if (!foundLeadingBytes) { - return buf - howFarBack; + return startIndex - howFarBack; } - int pos = 0; - int nextPos; - uint codePoint = 0; - - len += extraLen; - while (pos < len) + int idx = startIndex; + while (idx < buffer.Length) { - byte firstByte = buf[pos]; + byte firstByte = buffer[idx]; while (firstByte < 0b10000000) { - if (++pos == len) + if (++idx == buffer.Length) { - return buf + len; + return buffer.Length; } - firstByte = buf[pos]; + firstByte = buffer[idx]; } if ((firstByte & 0b11100000) == 0b11000000) { - nextPos = pos + 2; - if (nextPos > len) + int nextIdx = idx + 2; + if (nextIdx > buffer.Length) { - return buf + pos; + return idx; } // Too short - if ((buf[pos + 1] & 0b11000000) != 0b10000000) + if ((buffer[idx + 1] & 0b11000000) != 0b10000000) { - return buf + pos; + return idx; } // Too short // range check - codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(buf[pos + 1] & 0b00111111); + uint codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(buffer[idx + 1] & 0b00111111); if ((codePoint < 0x80) || (0x7ff < codePoint)) { - return buf + pos; + return idx; } // Overlong + idx = nextIdx; } else if ((firstByte & 0b11110000) == 0b11100000) { - nextPos = pos + 3; - if (nextPos > len) + int nextIdx = idx + 3; + if (nextIdx > buffer.Length) { - return buf + pos; + return idx; } // Too short // range check - codePoint = (uint)(firstByte & 0b00001111) << 12 | - (uint)(buf[pos + 1] & 0b00111111) << 6 | - (uint)(buf[pos + 2] & 0b00111111); + uint codePoint = (uint)(firstByte & 0b00001111) << 12 | + (uint)(buffer[idx 
+ 1] & 0b00111111) << 6 | + (uint)(buffer[idx + 2] & 0b00111111); // Either overlong or too large: if ((codePoint < 0x800) || (0xffff < codePoint) || (0xd7ff < codePoint && codePoint < 0xe000)) { - return buf + pos; + return idx; } - if ((buf[pos + 1] & 0b11000000) != 0b10000000) + if ((buffer[idx + 1] & 0b11000000) != 0b10000000) { - return buf + pos; + return idx; } // Too short - if ((buf[pos + 2] & 0b11000000) != 0b10000000) + if ((buffer[idx + 2] & 0b11000000) != 0b10000000) { - return buf + pos; + return idx; } // Too short + idx = nextIdx; } else if ((firstByte & 0b11111000) == 0b11110000) { - nextPos = pos + 4; - if (nextPos > len) + int nextIdx = idx + 4; + if (nextIdx > buffer.Length) { - return buf + pos; + return idx; } - if ((buf[pos + 1] & 0b11000000) != 0b10000000) + if ((buffer[idx + 1] & 0b11000000) != 0b10000000) { - return buf + pos; + return idx; } - if ((buf[pos + 2] & 0b11000000) != 0b10000000) + if ((buffer[idx + 2] & 0b11000000) != 0b10000000) { - return buf + pos; + return idx; } - if ((buf[pos + 3] & 0b11000000) != 0b10000000) + if ((buffer[idx + 3] & 0b11000000) != 0b10000000) { - return buf + pos; + return idx; } // range check - codePoint = - (uint)(firstByte & 0b00000111) << 18 | (uint)(buf[pos + 1] & 0b00111111) << 12 | - (uint)(buf[pos + 2] & 0b00111111) << 6 | (uint)(buf[pos + 3] & 0b00111111); + uint codePoint = + (uint)(firstByte & 0b00000111) << 18 | (uint)(buffer[idx + 1] & 0b00111111) << 12 | + (uint)(buffer[idx + 2] & 0b00111111) << 6 | (uint)(buffer[idx + 3] & 0b00111111); if (codePoint <= 0xffff || 0x10ffff < codePoint) { - return buf + pos; + return idx; } + idx = nextIdx; } else { // we may have a continuation/too long error - return buf + pos; + return idx; } - pos = nextPos; } - return buf + len; // no error + return buffer.Length; // no error } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int numFourByteSequences, ref int 
numContinuationBytes) + private static void AdjustCounters(ReadOnlySpan buffer, int processedIndex, int invalidIndex, ref int numFourByteSequences, ref int numContinuationBytes) { - if (pInvalid < pProcessed) + if (invalidIndex < processedIndex) { - for (byte* p = pInvalid; p < pProcessed; p++) + for (int i = invalidIndex; i < processedIndex; i++) { - if ((*p & 0b11000000) == 0b10000000) + if ((buffer[i] & 0b11000000) == 0b10000000) { numContinuationBytes -= 1; } - if ((*p & 0b11110000) == 0b11110000) + if ((buffer[i] & 0b11110000) == 0b11110000) { numFourByteSequences -= 1; } @@ -1185,13 +1184,13 @@ private static void AdjustCounters(byte* pProcessed, byte* pInvalid, ref int num } else { - for (byte* p = pProcessed; p < pInvalid; p++) + for (int i = processedIndex; i < invalidIndex; i++) { - if ((*p & 0b11000000) == 0b10000000) + if ((buffer[i] & 0b11000000) == 0b10000000) { numContinuationBytes += 1; } - if ((*p & 0b11110000) == 0b11110000) + if ((buffer[i] & 0b11110000) == 0b11110000) { numFourByteSequences += 1; } From 7f03ab4b2d65492405078efac824336e1f680e78 Mon Sep 17 00:00:00 2001 From: Yat Long Poon <56300571+ylpoonlg@users.noreply.github.com> Date: Tue, 24 Mar 2026 15:31:36 +0000 Subject: [PATCH 6/8] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../src/System/Text/Unicode/Utf8Utility.Validation.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index e72eb43c52b572..cbcb5990945fb8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -785,7 +785,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit Vector128 prevInputBlock = 
Vector128.Zero; - // This is used to detect whether the previous block of contains incomplete sequences. + // This is used to detect whether the previous block contains incomplete sequences. // It contains the maximum values the previous bytes can be without generating a carry. // If we see larger values, it means we need to go back and validate. // The first 13 bytes can never generate a carry for a valid UTF-8 byte sequence, the @@ -946,7 +946,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit Vector128 twoBytesError = byte1High & byte1Low & byte2High; // Check if the sequences with two continuation bytes are valid. - // This is only possible for 3 or 4-byte sequences, then we match the expected occurences + // This is only possible for 3 or 4-byte sequences, then we match the expected occurrences // against the MSB from the table lookup results. Vector128 isThirdByte = Vector128.SubtractSaturate(prev2, thirdByte); Vector128 isFourthByte = Vector128.SubtractSaturate(prev3, fourthByte); @@ -968,7 +968,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit if (AdvSimd.Arm64.IsSupported) { - vecContinuationBytes += Vector128.LessThanOrEqual(Vector128.AsSByte(currentBlock), largestContinuationByte); + vecContinuationBytes += Vector128.LessThanOrEqual(currentBlock.AsSByte(), largestContinuationByte); vecFourByteSequences += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte(); overflowCounter++; // We have a risk of overflow if overflowCounter reaches 127, From 034c620a422fb7dc1ab4f3796dd78924060b5097 Mon Sep 17 00:00:00 2001 From: Yat Long Poon Date: Mon, 18 May 2026 15:19:01 +0100 Subject: [PATCH 7/8] Fix byte sequence array in tests --- .../Unicode/Utf8UtilityTests.ValidateBytes.cs | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs 
b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs index 0cfbb8afb6b847..1d4f3cfadbbdc3 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs @@ -2,7 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Buffers; -using System.Collections; +using System.Collections.Generic; using System.Linq; using System.Reflection; using System.Runtime.InteropServices; @@ -22,6 +22,10 @@ public class Utf8UtilityTests private const string EURO_SYMBOL = "E282AC"; // U+20AC EURO SIGN, 3 bytes private const string GRINNING_FACE = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes + private static readonly byte[] validTwoByteSequence = Utf8Tests.DecodeHex(E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE); + private static readonly byte[] validThreeByteSequence = Utf8Tests.DecodeHex(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL); + private static readonly byte[] validFourByteSequence = Utf8Tests.DecodeHex(GRINNING_FACE + GRINNING_FACE + GRINNING_FACE + GRINNING_FACE); + [Theory] [InlineData("", 0, 0)] // empty string is OK [InlineData(X, 1, 0)] @@ -289,20 +293,20 @@ private static void AssertIsInvalidTwoByteSequence(byte[] invalidSequence) // Exercise the vectorized codepath and insert the invalid sequence at different positions. - byte[] byteVector = Utf8Tests.DecodeHex(E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE); - for (int pos = 0; pos <= 16; pos++) { - ArrayList testList = new ArrayList(byteVector); + List testList = new List(validTwoByteSequence); if (pos % 2 != 0) { // Replace bytes with valid ASCII characters so they can be broken up. 
- testList.SetRange(pos - pos % 2, new byte[2] {0x20, 0x21}); + int replacementStart = pos - pos % 2; + testList[replacementStart] = 0x20; + testList[replacementStart + 1] = 0x21; } testList.InsertRange(pos, invalidSequence); - byte[] toTest = (byte[])testList.ToArray(typeof(byte)); + byte[] toTest = testList.ToArray(); GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos + 1) / 2, 0); } } @@ -313,20 +317,21 @@ private static void AssertIsInvalidThreeByteSequence(byte[] invalidSequence) // Exercise the vectorized codepath and insert the invalid sequence at different positions. - byte[] byteVector = Utf8Tests.DecodeHex(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL); - for (int pos = 0; pos <= 16; pos++) { - ArrayList testList = new ArrayList(byteVector); + List testList = new List(validThreeByteSequence); if (pos % 3 != 0) { // Replace bytes with valid ASCII characters so they can be broken up. - testList.SetRange(pos - pos % 3, new byte[3] {0x20, 0x21, 0x22}); + int replacementStart = pos - pos % 3; + testList[replacementStart] = 0x20; + testList[replacementStart + 1] = 0x21; + testList[replacementStart + 2] = 0x22; } testList.InsertRange(pos, invalidSequence); - byte[] toTest = (byte[])testList.ToArray(typeof(byte)); + byte[] toTest = testList.ToArray(); GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos / 3) + (pos % 3), 0); } } @@ -337,20 +342,22 @@ private static void AssertIsInvalidFourByteSequence(byte[] invalidSequence) // Exercise the vectorized codepath and insert the invalid sequence at different positions. - byte[] byteVector = Utf8Tests.DecodeHex(GRINNING_FACE + GRINNING_FACE + GRINNING_FACE + GRINNING_FACE); - for (int pos = 0; pos <= 16; pos++) { - ArrayList testList = new ArrayList(byteVector); + List testList = new List(validFourByteSequence); if (pos % 4 != 0) { // Replace bytes with valid ASCII characters so they can be broken up. 
- testList.SetRange(pos - pos % 4, new byte[4] {0x20, 0x21, 0x22, 0x23}); + int replacementStart = pos - pos % 4; + testList[replacementStart] = 0x20; + testList[replacementStart + 1] = 0x21; + testList[replacementStart + 2] = 0x22; + testList[replacementStart + 3] = 0x23; } testList.InsertRange(pos, invalidSequence); - byte[] toTest = (byte[])testList.ToArray(typeof(byte)); + byte[] toTest = testList.ToArray(); GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos / 4) + (pos % 4), pos / 4); } } From 04be6a73f3670d2731f31d83a4e14b66beaf3342 Mon Sep 17 00:00:00 2001 From: Yat Long Poon Date: Mon, 18 May 2026 18:43:28 +0100 Subject: [PATCH 8/8] Comment out dead code --- .../src/System/Text/Unicode/Utf8Utility.Validation.cs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index cbcb5990945fb8..8f96ae2611b6d8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -963,11 +963,9 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit prevIncomplete = Vector128.SubtractSaturate(currentBlock, maxValue); - // For Arm64, use vecContinuationBytes and vecFourByteSequences to accumulate the sum for better performance. - // Otherwise, increment the adjustments directly on every iteration. - if (AdvSimd.Arm64.IsSupported) { + // For Arm64, use vecContinuationBytes and vecFourByteSequences to accumulate the sum for better performance. 
vecContinuationBytes += Vector128.LessThanOrEqual(currentBlock.AsSByte(), largestContinuationByte); vecFourByteSequences += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte(); overflowCounter++; @@ -990,8 +988,11 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit } else { - numContinuationBytes += Vector128.CountWhereAllBitsSet(byte2High); - numFourByteSequences += Vector128.CountWhereAllBitsSet(Vector128.SubtractSaturate(currentBlock, fourthByte)); + // Otherwise, increment the adjustments directly on every iteration. + // TODO: Support other architectures using CountWhereAllBitsSet. + // numContinuationBytes += Vector128.CountWhereAllBitsSet(byte2High); + // numFourByteSequences += Vector128.CountWhereAllBitsSet(Vector128.SubtractSaturate(currentBlock, fourthByte)); + throw new PlatformNotSupportedException(); } } }