diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT index 7f020cf8e67da5..3141d2bd6dfbbf 100644 --- a/THIRD-PARTY-NOTICES.TXT +++ b/THIRD-PARTY-NOTICES.TXT @@ -720,6 +720,35 @@ License for fastmod (https://github.com/lemire/fastmod), ibm-fpgen (https://gith See the License for the specific language governing permissions and limitations under the License. +License for SimdUnicode (https://github.com/simdutf/SimdUnicode) +-------------------------------------- + + Copyright 2025 Daniel Lemire, Nick Nuon + Which is based on "Validating UTF-8 In Less Than One Instruction Per Byte" article available at https://arxiv.org/abs/2010.03090 + (c) John Keiser, Daniel Lemire + + MIT License + + Copyright (c) 2023 SimdUnicode authors + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+ License for sse4-strstr (https://github.com/WojciechMula/sse4-strstr) -------------------------------------- diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 6c84d46a1ab6e0..f387e395c62f3e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1524,7 +1524,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii #if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) + internal static bool VectorContainsNonAsciiChar(Vector128 asciiVector) { // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index 821037a538b3c8..8f96ae2611b6d8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -67,6 +67,13 @@ internal static unsafe partial class Utf8Utility byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds #endif +#if NET + if (inputLength >= Vector128.Count && (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) + { + return GetPointerToFirstInvalidByte_Vector128(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); + } +#endif + while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) { // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar. 
@@ -762,6 +769,446 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); return extractedBits.AsUInt64().ToScalar(); } + + // The following algorithm is based on the SimdUnicode library: + // https://github.com/simdutf/SimdUnicode + // by Daniel Lemire and Nick Nuon + // And the paper "Validating UTF-8 In Less Than One Instruction Per Byte": + // https://arxiv.org/pdf/2010.03090 + // by John Keiser and Daniel Lemire, 2021 + + [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + private static byte* GetPointerToFirstInvalidByte_Vector128(byte* pInputBuffer, int inputLength, + out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) + { + Debug.Assert(inputLength >= Vector128.Count); + + Vector128 prevInputBlock = Vector128.Zero; + + // This is used to detect whether the previous block contains incomplete sequences. + // It contains the maximum values the previous bytes can be without generating a carry. + // If we see larger values, it means we need to go back and validate. + // The first 13 bytes can never generate a carry for a valid UTF-8 byte sequence; the + // last 3 bytes are capped just below the smallest lead byte of a 4-byte, 3-byte and 2-byte + // sequence respectively. + Vector128 maxValue = Vector128.Create( + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); + Vector128 prevIncomplete = Vector128.SubtractSaturate(prevInputBlock, maxValue); + int numIncomplete = 0; // maximum number of incomplete bytes to go back + + // The error bit encoding is slightly different from the paper; instead, it follows the + // SimdUnicode implementation. TOO_LARGE_1000 and OVERLONG_4 can share the same bit as + // their conditions are mutually exclusive. 
+ const byte TOO_SHORT = 1 << 0; // Sequence is missing continuation bytes + const byte TOO_LONG = 1 << 1; // ASCII byte is followed by a continuation byte + const byte OVERLONG_3 = 1 << 2; // Character is out-of-range for a 3-byte sequence + const byte SURROGATE = 1 << 4; // Character range is reserved for UTF-16 surrogates + const byte OVERLONG_2 = 1 << 5; // Character is out-of-range for a 2-byte sequence + const byte TWO_CONTS = 1 << 7; // (Not an error) Two continuation bytes + const byte TOO_LARGE = 1 << 3; // Character is larger than the largest Unicode character + const byte TOO_LARGE_1000 = 1 << 6; // Same as TOO_LARGE, but the 2nd byte's high nibble is 0b1000 + const byte OVERLONG_4 = 1 << 6; // Character is out-of-range for a 4-byte sequence + const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // A common case for continuation bytes + + // The 3 lookup tables used to map nibbles of consecutive bytes to possible errors in each case. + // A 4-bit nibble from the upper or lower half of a byte is used as an index (0-15) to look up the + // corresponding error mask from the 128-bit vector. 
+ + Vector128 tableByte1High = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + TOO_SHORT | OVERLONG_2, + TOO_SHORT, + TOO_SHORT | OVERLONG_3 | SURROGATE, + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + + Vector128 tableByte1Low = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + CARRY | OVERLONG_2, + CARRY, + CARRY, + CARRY | TOO_LARGE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + + Vector128 tableByte2High = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + + Vector128 thirdByte = Vector128.Create((byte)(0b11100000u - 0x80)); + Vector128 fourthByte = Vector128.Create((byte)(0b11110000u - 0x80)); + Vector128 vec0F = Vector128.Create((byte)0x0F); + Vector128 vec80 = Vector128.Create((byte)0x80); + Vector128 fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1)); + Vector128 largestContinuationByte = Vector128.Create((sbyte)-65); // -65 => 0b10111111 + + int numContinuationBytes = 0; // number of continuation bytes in the block + int numFourByteSequences = 0; // number of 4-byte sequences that start in this block + + // For Arm64: + // Instead of updating numFourByteSequences and 
numContinuationBytes continuously, we accumulate + // the values in vecFourByteSequences and vecContinuationBytes, while using overflowCounter to make + // sure we do not overflow. This allows you to reach good performance + // on systems where summing across vectors is slow. + Vector128 vecFourByteSequences = Vector128.Zero; + Vector128 vecContinuationBytes = Vector128.Zero; + int overflowCounter = 0; + + bool foundError = false; + + int processedLength = 0; + for (; processedLength <= inputLength - Vector128.Count; processedLength += Vector128.Count) + { + Vector128 currentBlock = Vector128.Load(pInputBuffer + processedLength); + if (!Ascii.VectorContainsNonAsciiChar(currentBlock)) + { + // We have an ASCII block, no need to process it, but + // we need to check if the previous block was incomplete. + if (prevIncomplete != Vector128.Zero) + { + numIncomplete = 3; + foundError = true; + break; + } + + // Often, we have a lot of ASCII characters in a row. + if (processedLength + (Vector128.Count * 2) <= inputLength) + { + byte* pAsciiRunStart = pInputBuffer + processedLength; + byte* pAsciiRunCurrent = pAsciiRunStart + Vector128.Count; // The first block is already checked + + Vector128 block = Vector128.Load(pAsciiRunCurrent); + if (!Ascii.VectorContainsNonAsciiChar(block)) + { + // If we see more ASCII characters, unroll the loop by 4. 
+ + byte* pAsciiRunEnd = pAsciiRunStart + inputLength - processedLength - (Vector128.Count * 4); + + for (; pAsciiRunCurrent <= pAsciiRunEnd; pAsciiRunCurrent += (Vector128.Count * 4)) + { + Vector128 block1 = Vector128.Load(pAsciiRunCurrent); + Vector128 block2 = Vector128.Load(pAsciiRunCurrent + (Vector128.Count * 1)); + Vector128 block3 = Vector128.Load(pAsciiRunCurrent + (Vector128.Count * 2)); + Vector128 block4 = Vector128.Load(pAsciiRunCurrent + (Vector128.Count * 3)); + if (Ascii.VectorContainsNonAsciiChar(block1 | block2 | block3 | block4)) + { + break; + } + } + } + + processedLength += (int)(pAsciiRunCurrent - pAsciiRunStart) - Vector128.Count; + } + } + else + { + // Contains non-ASCII characters, we need to do non-trivial processing + + Vector128 prev1; + Vector128 prev2; + Vector128 prev3; + Vector128 byte1High; + Vector128 byte1Low; + Vector128 byte2High; + + // TODO: Support for other architectures can be added here. + + if (AdvSimd.Arm64.IsSupported) + { + prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 1)); + byte1High = AdvSimd.Arm64.VectorTableLookup(tableByte1High, prev1 >>> 4); + byte1Low = AdvSimd.Arm64.VectorTableLookup(tableByte1Low, (prev1 & vec0F)); + byte2High = AdvSimd.Arm64.VectorTableLookup(tableByte2High, currentBlock >>> 4); + prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2)); + prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3)); + } + else + { + throw new PlatformNotSupportedException(); + } + + prevInputBlock = currentBlock; + + // Find invalid 2-byte sequences by matching the error bits from the table lookups. + Vector128 twoBytesError = byte1High & byte1Low & byte2High; + + // Check if the sequences with two continuation bytes are valid. + // This is only possible for 3 or 4-byte sequences, then we match the expected occurrences + // against the MSB from the table lookup results. 
+ Vector128 isThirdByte = Vector128.SubtractSaturate(prev2, thirdByte); + Vector128 isFourthByte = Vector128.SubtractSaturate(prev3, fourthByte); + Vector128 twoContinuationBytes = (isThirdByte | isFourthByte) & vec80; // Extract the MSB + Vector128 error = twoContinuationBytes ^ twoBytesError; + + if (error != Vector128.Zero) + { + // Error is found if the error bit mask is non-zero. + numIncomplete = (processedLength == 0) ? 0 : 3; + foundError = true; + break; + } + + prevIncomplete = Vector128.SubtractSaturate(currentBlock, maxValue); + + if (AdvSimd.Arm64.IsSupported) + { + // For Arm64, use vecContinuationBytes and vecFourByteSequences to accumulate the sum for better performance. + vecContinuationBytes += Vector128.LessThanOrEqual(currentBlock.AsSByte(), largestContinuationByte); + vecFourByteSequences += Vector128.GreaterThan(currentBlock, fourthByteMinusOne).AsSByte(); + overflowCounter++; + // We have a risk of overflow if overflowCounter reaches 127, + // in which case, we empty vecContinuationBytes and vecFourByteSequences, and update numContinuationBytes and + // numFourByteSequences. + if (overflowCounter == 0x7f) + { + overflowCounter = 0; + + // The vector results are negative, so subtract to make the scalar positive. + numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar(); + vecContinuationBytes = Vector128.Zero; + if (vecFourByteSequences != Vector128.Zero) + { + numFourByteSequences -= AdvSimd.Arm64.AddAcrossWidening(vecFourByteSequences).ToScalar(); + vecFourByteSequences = Vector128.Zero; + } + } + } + else + { + // Otherwise, increment the adjustments directly on every iteration. + // TODO: Support other architectures using CountWhereAllBitsSet. 
+ // numContinuationBytes += Vector128.CountWhereAllBitsSet(byte2High); + // numFourByteSequences += Vector128.CountWhereAllBitsSet(Vector128.SubtractSaturate(currentBlock, fourthByte)); + throw new PlatformNotSupportedException(); + } + } + } + + // Sum up the remaining values from vecContinuationBytes and vecFourByteSequences. + if (AdvSimd.Arm64.IsSupported) + { + numContinuationBytes -= AdvSimd.Arm64.AddAcrossWidening(vecContinuationBytes).ToScalar(); + if (vecFourByteSequences != Vector128.Zero) + { + numFourByteSequences -= AdvSimd.Arm64.AddAcrossWidening(vecFourByteSequences).ToScalar(); + } + } + else + { + // Do nothing since numContinuationBytes and numFourByteSequences were incremented directly. + } + + if (!foundError) + { + bool hasIncomplete = prevIncomplete != Vector128.Zero; + if (processedLength == inputLength && !hasIncomplete) + { + // No invalid byte is found across the whole input length. + + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes); + return pInputBuffer + inputLength; + } + + // Still has incomplete data to validate. + numIncomplete = hasIncomplete ? 3 : 0; + } + + // Find the first invalid byte, going back if necessary. + // Then, adjust the counters 'numFourByteSequences' and 'numContinuationBytes', since we might be + // overcounting or undercounting them during processing. 
+ + ReadOnlySpan inputSpan = new ReadOnlySpan(pInputBuffer, inputLength); + + int invalidIndex = SimpleRewindAndValidateWithErrors(numIncomplete, inputSpan, processedLength - numIncomplete); + byte* invalidBytePointer = pInputBuffer + invalidIndex; + + AdjustCounters(inputSpan, processedLength, invalidIndex, ref numFourByteSequences, ref numContinuationBytes); + + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(numFourByteSequences, numContinuationBytes); + return invalidBytePointer; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int SimpleRewindAndValidateWithErrors(int howFarBack, ReadOnlySpan buffer, int startIndex) + { + // We scan from startIndex forward, possibly going back howFarBack bytes, to find the end of + // a valid UTF-8 sequence. We return buffer.Length if the buffer is valid, otherwise we return + // the index of the first invalid byte + + bool foundLeadingBytes = false; + + for (int i = 0; i <= howFarBack; i++) + { + byte candidateByte = buffer[startIndex - i]; + foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + + if (foundLeadingBytes) + { + startIndex -= i; + break; + } + } + + if (!foundLeadingBytes) + { + return startIndex - howFarBack; + } + + int idx = startIndex; + while (idx < buffer.Length) + { + byte firstByte = buffer[idx]; + + while (firstByte < 0b10000000) + { + if (++idx == buffer.Length) + { + return buffer.Length; + } + firstByte = buffer[idx]; + } + + if ((firstByte & 0b11100000) == 0b11000000) + { + int nextIdx = idx + 2; + if (nextIdx > buffer.Length) + { + return idx; + } // Too short + if ((buffer[idx + 1] & 0b11000000) != 0b10000000) + { + return idx; + } // Too short + // range check + uint codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(buffer[idx + 1] & 0b00111111); + if ((codePoint < 0x80) || (0x7ff < codePoint)) + { + return idx; + } // Overlong + idx = nextIdx; + } + else if ((firstByte & 0b11110000) == 0b11100000) + { + int 
nextIdx = idx + 3; + if (nextIdx > buffer.Length) + { + return idx; + } // Too short + // range check + uint codePoint = (uint)(firstByte & 0b00001111) << 12 | + (uint)(buffer[idx + 1] & 0b00111111) << 6 | + (uint)(buffer[idx + 2] & 0b00111111); + // Overlong, too large, or a UTF-16 surrogate (U+D800..U+DFFF): + if ((codePoint < 0x800) || (0xffff < codePoint) || + (0xd7ff < codePoint && codePoint < 0xe000)) + { + return idx; + } + if ((buffer[idx + 1] & 0b11000000) != 0b10000000) + { + return idx; + } // Too short + if ((buffer[idx + 2] & 0b11000000) != 0b10000000) + { + return idx; + } // Too short + idx = nextIdx; + } + else if ((firstByte & 0b11111000) == 0b11110000) + { + int nextIdx = idx + 4; + if (nextIdx > buffer.Length) + { + return idx; + } + if ((buffer[idx + 1] & 0b11000000) != 0b10000000) + { + return idx; + } + if ((buffer[idx + 2] & 0b11000000) != 0b10000000) + { + return idx; + } + if ((buffer[idx + 3] & 0b11000000) != 0b10000000) + { + return idx; + } + // range check: overlong (<= U+FFFF) or beyond U+10FFFF + uint codePoint = + (uint)(firstByte & 0b00000111) << 18 | (uint)(buffer[idx + 1] & 0b00111111) << 12 | + (uint)(buffer[idx + 2] & 0b00111111) << 6 | (uint)(buffer[idx + 3] & 0b00111111); + if (codePoint <= 0xffff || 0x10ffff < codePoint) + { + return idx; + } + idx = nextIdx; + } + else + { + // we may have a continuation/too long error + return idx; + } + } + + return buffer.Length; // no error + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void AdjustCounters(ReadOnlySpan buffer, int processedIndex, int invalidIndex, ref int numFourByteSequences, ref int numContinuationBytes) + { + if (invalidIndex < processedIndex) + { + for (int i = invalidIndex; i < processedIndex; i++) + { + if ((buffer[i] & 0b11000000) == 0b10000000) + { + numContinuationBytes -= 1; + } + if ((buffer[i] & 0b11110000) == 0b11110000) + { + numFourByteSequences -= 1; + } + } + } + else + { + for (int i = processedIndex; i < invalidIndex; i++) + { + if ((buffer[i] & 0b11000000) == 0b10000000) + { + 
numContinuationBytes += 1; + } + if ((buffer[i] & 0b11110000) == 0b11110000) + { + numFourByteSequences += 1; + } + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int numFourByteSequences, int numContinuationBytes) + { + int n3 = -2 * numFourByteSequences + 2 * numContinuationBytes; + int n2 = numFourByteSequences - 3 * numContinuationBytes; + int utfadjust = -2 * numFourByteSequences - 2 * n3 - n2; + int scalaradjust = -numFourByteSequences; + + return (utfadjust, scalaradjust); + } #endif } } diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs index 4730337b0878ed..1d4f3cfadbbdc3 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. 
using System.Buffers; +using System.Collections.Generic; using System.Linq; using System.Reflection; using System.Runtime.InteropServices; @@ -21,6 +22,10 @@ public class Utf8UtilityTests private const string EURO_SYMBOL = "E282AC"; // U+20AC EURO SIGN, 3 bytes private const string GRINNING_FACE = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes + private static readonly byte[] validTwoByteSequence = Utf8Tests.DecodeHex(E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE); + private static readonly byte[] validThreeByteSequence = Utf8Tests.DecodeHex(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL); + private static readonly byte[] validFourByteSequence = Utf8Tests.DecodeHex(GRINNING_FACE + GRINNING_FACE + GRINNING_FACE + GRINNING_FACE); + [Theory] [InlineData("", 0, 0)] // empty string is OK [InlineData(X, 1, 0)] @@ -241,6 +246,13 @@ public void GetIndexOfFirstInvalidUtf8Sequence_WithOutOfRangeFourByteSequences_R { AssertIsInvalidFourByteSequence(new byte[] { 0xF4, (byte)i, 0x80, 0x80 }); } + + // [ F5..FF ] [ 80..BF ] [ 80..BF ] [ 80..BF ] is out-of-range 4-byte sequence + + for (int i = 0xF5; i < 0x100; i++) + { + AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x80, 0x80, 0x80 }); + } } [Fact] @@ -279,57 +291,75 @@ private static void AssertIsInvalidTwoByteSequence(byte[] invalidSequence) { Assert.Equal(2, invalidSequence.Length); - byte[] knownGoodBytes = Utf8Tests.DecodeHex(E_ACUTE); - - byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0); + // Exercise the vectorized codepath and insert the invalid sequence at different positions. 
- toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of first DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 2, 1, 0); - - // Run the same tests but with extra data at the beginning so that we're inside one of - // the 2-byte processing "hot loop" code paths. + for (int pos = 0; pos <= 16; pos++) + { + List testList = new List(validTwoByteSequence); - toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of next DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 2, 0); + if (pos % 2 != 0) + { + // Replace bytes with valid ASCII characters so they can be broken up. + int replacementStart = pos - pos % 2; + testList[replacementStart] = 0x20; + testList[replacementStart + 1] = 0x21; + } - toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of next DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 3, 0); + testList.InsertRange(pos, invalidSequence); + byte[] toTest = testList.ToArray(); + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos + 1) / 2, 0); + } } private static void AssertIsInvalidThreeByteSequence(byte[] invalidSequence) { Assert.Equal(3, invalidSequence.Length); - byte[] knownGoodBytes = Utf8Tests.DecodeHex(EURO_SYMBOL); + // Exercise the vectorized codepath and insert the invalid sequence at different positions. - byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0); - - // Run the same tests but with extra data at the beginning so that we're inside one of - // the 3-byte processing "hot loop" code paths. 
- - toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling first and second DWORDs - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 3, 1, 0); + for (int pos = 0; pos <= 16; pos++) + { + List testList = new List(validThreeByteSequence); - toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling second and third DWORDs - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 2, 0); + if (pos % 3 != 0) + { + // Replace bytes with valid ASCII characters so they can be broken up. + int replacementStart = pos - pos % 3; + testList[replacementStart] = 0x20; + testList[replacementStart + 1] = 0x21; + testList[replacementStart + 2] = 0x22; + } - toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of third DWORD - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 9, 3, 0); + testList.InsertRange(pos, invalidSequence); + byte[] toTest = testList.ToArray(); + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos / 3) + (pos % 3), 0); + } } private static void AssertIsInvalidFourByteSequence(byte[] invalidSequence) { Assert.Equal(4, invalidSequence.Length); - byte[] knownGoodBytes = Utf8Tests.DecodeHex(GRINNING_FACE); + // Exercise the vectorized codepath and insert the invalid sequence at different positions. - byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0); + for (int pos = 0; pos <= 16; pos++) + { + List testList = new List(validFourByteSequence); - toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); - GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 1, 1); + if (pos % 4 != 0) + { + // Replace bytes with valid ASCII characters so they can be broken up. 
+ int replacementStart = pos - pos % 4; + testList[replacementStart] = 0x20; + testList[replacementStart + 1] = 0x21; + testList[replacementStart + 2] = 0x22; + testList[replacementStart + 3] = 0x23; + } + + testList.InsertRange(pos, invalidSequence); + byte[] toTest = testList.ToArray(); + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, pos, (pos / 4) + (pos % 4), pos / 4); + } } private static void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(string inputHex, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)