From c69998277eb6662b6fb80ffa864de99aed75eda3 Mon Sep 17 00:00:00 2001 From: lewing Date: Wed, 24 Jun 2026 23:53:05 -0500 Subject: [PATCH 01/11] Add Wasm PackedSimd path to Vector128.UnpackLow/UnpackHigh These internal helpers are used by HexConverter.AsciiToHexVector128 and other byte-interleaving code paths. They previously dispatched only to Sse2.UnpackLow/UnpackHigh or AdvSimd.Arm64.ZipLow/ZipHigh and threw NotSupportedException on platforms without either ISA. With the recent change that enables HexConverter and Guid format on Wasm via PackedSimd, the helpers became reachable on browser-wasm and started throwing at runtime in libraries tests. Lower to PackedSimd.Shuffle with a constant 16-byte index vector (i8x16.shuffle) when PackedSimd is supported. Validated via System.Runtime.Extensions.Tests on browser-wasm (8224 passing, 0 failed) after the previous run failed 132 tests. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../System/Runtime/Intrinsics/Vector128.cs | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index 16f46daddd7eab..987c3c3d30671f 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -4388,33 +4388,47 @@ internal static void SetElementUnsafe(in this Vector128 vector, int index, [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] [CompExactlyDependsOn(typeof(Sse2))] + [CompExactlyDependsOn(typeof(PackedSimd))] internal static Vector128 UnpackLow(Vector128 left, Vector128 right) { if (Sse2.IsSupported) { return Sse2.UnpackLow(left, right); } - else if (!AdvSimd.Arm64.IsSupported) + else if (AdvSimd.Arm64.IsSupported) { - 
ThrowHelper.ThrowNotSupportedException(); + return AdvSimd.Arm64.ZipLow(left, right); + } + else if (PackedSimd.IsSupported) + { + return PackedSimd.Shuffle(left, right, + Vector128.Create((byte)0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)); } - return AdvSimd.Arm64.ZipLow(left, right); + ThrowHelper.ThrowNotSupportedException(); + return default; } [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] [CompExactlyDependsOn(typeof(Sse2))] + [CompExactlyDependsOn(typeof(PackedSimd))] internal static Vector128 UnpackHigh(Vector128 left, Vector128 right) { if (Sse2.IsSupported) { return Sse2.UnpackHigh(left, right); } - else if (!AdvSimd.Arm64.IsSupported) + else if (AdvSimd.Arm64.IsSupported) { - ThrowHelper.ThrowNotSupportedException(); + return AdvSimd.Arm64.ZipHigh(left, right); + } + else if (PackedSimd.IsSupported) + { + return PackedSimd.Shuffle(left, right, + Vector128.Create((byte)8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)); } - return AdvSimd.Arm64.ZipHigh(left, right); + ThrowHelper.ThrowNotSupportedException(); + return default; } } } From 3dc956b6465a6d8b2d36cf0be10c894048002c45 Mon Sep 17 00:00:00 2001 From: lewing Date: Wed, 24 Jun 2026 23:34:53 -0500 Subject: [PATCH 02/11] Enable Vector128 hex/Guid format fast path on Wasm HexConverter.AsciiToHexVector128, HexConverter.EncodeTo_Vector128 and Guid.FormatGuidVector128Utf8 already use only portable Vector128 ops (Vector128.ShuffleNative, Vector128.UnpackLow/High, Vector128.Shuffle with constant indices) plus an optional AdvSimd.Arm64-specific branch. The gates at Convert.ToHexString, EncodeToUtf8/Utf16, and Guid.ToString required Ssse3 or AdvSimd.Arm64, so Wasm fell back to scalar even with PackedSimd. Add PackedSimd.IsSupported to the gates and the [CompExactlyDependsOn] attributes on the helpers. The bodies are unchanged; on Wasm the existing else branch (portable Vector128.Shuffle) is selected. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/libraries/Common/src/System/HexConverter.cs | 8 +++++--- src/libraries/System.Private.CoreLib/src/System/Guid.cs | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/libraries/Common/src/System/HexConverter.cs b/src/libraries/Common/src/System/HexConverter.cs index 10316e0c610325..f0a051393f6660 100644 --- a/src/libraries/Common/src/System/HexConverter.cs +++ b/src/libraries/Common/src/System/HexConverter.cs @@ -96,9 +96,10 @@ public static void ToCharsBuffer(byte value, Span buffer, int startingInde [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + [CompExactlyDependsOn(typeof(PackedSimd))] internal static (Vector128, Vector128) AsciiToHexVector128(Vector128 src, Vector128 hexMap) { - Debug.Assert(Ssse3.IsSupported || AdvSimd.Arm64.IsSupported); + Debug.Assert(Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported); // The algorithm is simple: a single srcVec (contains the whole 16b Guid) is converted // into nibbles and then, via hexMap, converted into a HEX representation via @@ -115,6 +116,7 @@ internal static (Vector128, Vector128) AsciiToHexVector128(Vector128 [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + [CompExactlyDependsOn(typeof(PackedSimd))] private static void EncodeTo_Vector128(ReadOnlySpan source, Span destination, Casing casing) { Debug.Assert(source.Length >= (Vector128.Count / 2)); @@ -187,7 +189,7 @@ public static void EncodeToUtf8(ReadOnlySpan source, Span utf8Destin Debug.Assert(utf8Destination.Length >= (source.Length * 2)); #if SYSTEM_PRIVATE_CORELIB - if ((AdvSimd.Arm64.IsSupported || Ssse3.IsSupported) && (source.Length >= (Vector128.Count / 2))) + if ((AdvSimd.Arm64.IsSupported || Ssse3.IsSupported || PackedSimd.IsSupported) && (source.Length >= (Vector128.Count / 2))) { EncodeTo_Vector128(source, 
utf8Destination, casing); return; @@ -204,7 +206,7 @@ public static void EncodeToUtf16(ReadOnlySpan source, Span destinati Debug.Assert(destination.Length >= (source.Length * 2)); #if SYSTEM_PRIVATE_CORELIB - if ((AdvSimd.Arm64.IsSupported || Ssse3.IsSupported) && (source.Length >= (Vector128.Count / 2))) + if ((AdvSimd.Arm64.IsSupported || Ssse3.IsSupported || PackedSimd.IsSupported) && (source.Length >= (Vector128.Count / 2))) { EncodeTo_Vector128(source, Unsafe.BitCast, Span>(destination), casing); return; diff --git a/src/libraries/System.Private.CoreLib/src/System/Guid.cs b/src/libraries/System.Private.CoreLib/src/System/Guid.cs index 60b6b289991ca3..1ac2e1ccf24451 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Guid.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Guid.cs @@ -10,6 +10,7 @@ using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; using System.Runtime.Versioning; using System.Text; @@ -1345,7 +1346,7 @@ internal unsafe bool TryFormatCore(Span destination, out int chars } flags >>= 8; - if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian) + if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported) && BitConverter.IsLittleEndian) { // Vectorized implementation for D, N, P and B formats: // [{|(]dddddddd[-]dddd[-]dddd[-]dddd[-]dddddddddddd[}|)] @@ -1513,9 +1514,10 @@ static void WriteHex(Span dest, int offset, int val, bool appendComma = t [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + [CompExactlyDependsOn(typeof(PackedSimd))] private static (Vector128, Vector128, Vector128) FormatGuidVector128Utf8(Guid value, bool useDashes) { - Debug.Assert((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian); + Debug.Assert((Ssse3.IsSupported || 
AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported) && BitConverter.IsLittleEndian); // Vectorized implementation for D, N, P and B formats: // [{|(]dddddddd[-]dddd[-]dddd[-]dddd[-]dddddddddddd[}|)] From 061b22bffcf212154b7d75a30ba8d15dc91ce22a Mon Sep 17 00:00:00 2001 From: lewing Date: Wed, 24 Jun 2026 23:35:37 -0500 Subject: [PATCH 03/11] Vectorize Utf8Utility.Validation ASCII fast path on Wasm GetPointerToFirstInvalidChar's inner ASCII-scan loop dispatched on AdvSimd.Arm64 (with bitmask128) or Sse2 (with MoveMask), falling back to a scalar 4-DWORD-at-a-time path otherwise. On Wasm with PackedSimd, neither SIMD branch was taken, so UTF-8 validation took the scalar path. Add a PackedSimd.IsSupported branch that uses portable Vector128.LoadUnsafe + ExtractMostSignificantBits to compute the same per-byte non-ASCII bitmask used by the Sse2 path. Update the post-loop Debug.Assert to include PackedSimd. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../System/Text/Unicode/Utf8Utility.Validation.cs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index 0717ed348af4ed..5388c6c68d0052 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -9,6 +9,7 @@ #if NET using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; #endif @@ -158,6 +159,15 @@ internal static unsafe partial class Utf8Utility goto LoopTerminatedEarlyDueToNonAsciiData; } } + else if (PackedSimd.IsSupported) + { + uint mask = Vector128.LoadUnsafe(ref *pInputBuffer).ExtractMostSignificantBits(); + if (mask != 0) + { + trailingZeroCount = 
(nuint)BitOperations.TrailingZeroCount(mask); + goto LoopTerminatedEarlyDueToNonAsciiData; + } + } else #endif { @@ -182,7 +192,7 @@ internal static unsafe partial class Utf8Utility LoopTerminatedEarlyDueToNonAsciiData: // x86 can only be little endian, while ARM can be big or little endian // so if we reached this label we need to check both combinations are supported - Debug.Assert((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported); + Debug.Assert((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported || PackedSimd.IsSupported); // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit From da08d905c6e922246bddec1b1f11ca08be530928 Mon Sep 17 00:00:00 2001 From: lewing Date: Thu, 25 Jun 2026 00:07:33 -0500 Subject: [PATCH 04/11] Vectorize Utf8Utility.Transcoding ASCII fast path on Wasm TranscodeToUtf8's 8-char ASCII fast loop used a Vector128 read, a mask-and-compare to detect non-ASCII, and a narrow-and-store of 8 bytes using Sse2.PackUnsignedSaturate / AdvSimd.ExtractNarrowingSaturateUnsignedLower. Two follow-on 4-char sites narrowed 4 bytes the same way. All four sites required Sse41.X64 or AdvSimd.Arm64 + LE, so Wasm took the 4-DWORD-at-a-time scalar fallback. Add PackedSimd branches at every dispatch site: - Outer entry gate (declaration + entry condition) - 8-char narrow-store: use the existing portable AND-compare for the non-ASCII test (same code Sse41 already uses) and PackedSimd.ConvertNarrowingSaturateUnsigned + scalar extract for the store - 4-char narrow-stores: PackedSimd.ConvertNarrowingSaturateUnsigned + AsUInt32().ToScalar() unaligned write The Sse2.X64.ConvertToUInt64 sub-branch already had an else path that calls AsUInt64().ToScalar(), which works on Wasm without changes. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Text/Unicode/Utf8Utility.Transcoding.cs | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs index ff095110653805..7931269a0e5c60 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -10,6 +10,7 @@ #if NET using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; #endif @@ -884,7 +885,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt #if NET Vector128 nonAsciiUtf16DataMask; - if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) + if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || (PackedSimd.IsSupported && BitConverter.IsLittleEndian)) { nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char } @@ -944,7 +945,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining); #if NET - if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) + if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || (PackedSimd.IsSupported && BitConverter.IsLittleEndian)) { // Try reading and writing 8 elements per iteration. 
uint maxIters = minElementsRemaining / 8; @@ -982,6 +983,17 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt // narrow and write Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64()); } + else if (PackedSimd.IsSupported) + { + if ((utf16Data & nonAsciiUtf16DataMask) != Vector128.Zero) + { + goto LoopTerminatedDueToNonAsciiDataInVectorLocal; + } + + // narrow and write low 8 bytes + Vector128 narrowed = PackedSimd.ConvertNarrowingSaturateUnsigned(utf16Data, utf16Data); + Unsafe.WriteUnaligned(pOutputBuffer, narrowed.AsUInt64().ToScalar()); + } else { // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead @@ -1015,6 +1027,11 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt { Unsafe.WriteUnaligned(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32())); } + else if (PackedSimd.IsSupported) + { + Vector128 narrowed = PackedSimd.ConvertNarrowingSaturateUnsigned(utf16Data, utf16Data); + Unsafe.WriteUnaligned(pOutputBuffer, narrowed.AsUInt32().ToScalar()); + } else { // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead @@ -1058,6 +1075,11 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt { Unsafe.WriteUnaligned(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32())); } + else if (PackedSimd.IsSupported) + { + Vector128 narrowed = PackedSimd.ConvertNarrowingSaturateUnsigned(utf16Data, utf16Data); + Unsafe.WriteUnaligned(pOutputBuffer, narrowed.AsUInt32().ToScalar()); + } else { // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead From 2caa5e259971daa85d4c9f8f3604baf43b2da1ab Mon Sep 17 00:00:00 2001 From: lewing Date: Thu, 25 Jun 2026 00:22:24 -0500 Subject: [PATCH 
05/11] Enable Teddy multi-string search on Wasm SearchValues with 2+ values previously selected the Aho-Corasick implementation on Wasm because the Teddy entry gate in StringSearchValues.cs required Ssse3 or AdvSimd.Arm64. Teddy's core Vector128 primitives in TeddyHelper.cs (LoadAndPack16AsciiChars, the nibble GetNibbles helper, the two-table Shuffle, and RightShift1/2) similarly excluded PackedSimd. Add PackedSimd branches throughout: - LoadAndPack16AsciiChars: PackedSimd.ConvertNarrowingSaturateUnsigned - GetNibbles: PackedSimd needs the explicit '& 0xF' on the low half because Swizzle returns 0 for indices >= 16 (unlike Ssse3's implicit AND of the low 4 bits) - Shuffle: already uses portable Vector128.ShuffleNative which maps to PackedSimd.Swizzle; just widen the [CompExactlyDependsOn] - RightShift1/RightShift2: compose two Vector128.ShuffleNative calls with constant index vectors and OR the halves. PackedSimd.Shuffle (two-vector i8x16.shuffle) is impractical due to constant lane index requirements; Swizzle returns 0 for out-of-range indices, which makes the OR safe. Widen the entry gate in StringSearchValues.cs.CreateFromNormalizedValues and the null-char filter in TryGetTeddyAcceleratedValues (PackedSimd shares Ssse3's PackUnsignedSaturate behavior where signed negative inputs become 0, so null-containing needles produce more false positives on both). Widen [CompExactlyDependsOn] on the IndexOfAnyN2/N3 + Vector128 helpers in AsciiStringSearchValuesTeddyBase.cs. Validated: System.Memory.Tests on browser-wasm 52249/52249 passing (covers SearchValues Teddy paths via StringSearchValues tests), host 52905/52906 unchanged. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../AsciiStringSearchValuesTeddyBase.cs | 5 +++ .../Strings/Helpers/TeddyHelper.cs | 34 ++++++++++++++++++- .../Strings/StringSearchValues.cs | 10 +++--- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/AsciiStringSearchValuesTeddyBase.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/AsciiStringSearchValuesTeddyBase.cs index 0869ad1553f895..2d96c3009f9fdc 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/AsciiStringSearchValuesTeddyBase.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/AsciiStringSearchValuesTeddyBase.cs @@ -8,6 +8,7 @@ using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; using static System.Buffers.StringSearchValuesHelper; using static System.Buffers.TeddyHelper; @@ -150,6 +151,7 @@ protected AsciiStringSearchValuesTeddyBase(string[][] buckets, ReadOnlySpan span) { // The behavior of the rest of the function remains the same if Avx2 or Avx512BW aren't supported @@ -170,6 +172,7 @@ protected int IndexOfAnyN2(ReadOnlySpan span) [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + [CompExactlyDependsOn(typeof(PackedSimd))] protected int IndexOfAnyN3(ReadOnlySpan span) { // The behavior of the rest of the function remains the same if Avx2 or Avx512BW aren't supported @@ -190,6 +193,7 @@ protected int IndexOfAnyN3(ReadOnlySpan span) [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + [CompExactlyDependsOn(typeof(PackedSimd))] private int IndexOfAnyN2Vector128(ReadOnlySpan span) { // See comments in 'IndexOfAnyN3Vector128' below. 
@@ -350,6 +354,7 @@ private int IndexOfAnyN2Avx512(ReadOnlySpan span) [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + [CompExactlyDependsOn(typeof(PackedSimd))] private int IndexOfAnyN3Vector128(ReadOnlySpan span) { // We can't process inputs shorter than 18 characters in a vectorized manner here. diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs index 064838c98cf361..7fae7b0fd35c50 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs @@ -4,6 +4,7 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; namespace System.Buffers @@ -17,6 +18,7 @@ internal static class TeddyHelper [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + [CompExactlyDependsOn(typeof(PackedSimd))] public static (Vector128 Result, Vector128 Prev0) ProcessInputN2( Vector128 input, Vector128 prev0, @@ -90,6 +92,7 @@ public static (Vector512 Result, Vector512 Prev0) ProcessInputN2( [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + [CompExactlyDependsOn(typeof(PackedSimd))] public static (Vector128 Result, Vector128 Prev0, Vector128 Prev1) ProcessInputN3( Vector128 input, Vector128 prev0, Vector128 prev1, @@ -216,6 +219,7 @@ public static (Vector512 Result, Vector512 Prev0, Vector512 Pr [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Sse2))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + [CompExactlyDependsOn(typeof(PackedSimd))] 
public static Vector128 LoadAndPack16AsciiChars(ref char source) { Vector128 source0 = Vector128.LoadUnsafe(ref source); @@ -229,6 +233,10 @@ public static Vector128 LoadAndPack16AsciiChars(ref char source) { return AdvSimd.Arm64.UnzipEven(source0.AsByte(), source1.AsByte()); } + else if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateUnsigned(source0.AsInt16(), source1.AsInt16()); + } else { // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead @@ -276,10 +284,13 @@ public static Vector512 LoadAndPack64AsciiChars(ref char source) [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd))] + [CompExactlyDependsOn(typeof(PackedSimd))] private static (Vector128 Low, Vector128 High) GetNibbles(Vector128 input) { // 'low' is not strictly correct here, but we take advantage of Ssse3.Shuffle's behavior - // of doing an implicit 'AND 0xF' in order to skip the redundant AND. + // of doing an implicit 'AND 0xF' in order to skip the redundant AND. PackedSimd.Swizzle + // and AdvSimd's table lookup return 0 for indices >= 16 (instead of masking the low 4 + // bits), so they need the explicit AND. Vector128 low = Ssse3.IsSupported ? 
input : input & Vector128.Create((byte)0xF); @@ -316,6 +327,7 @@ private static (Vector512 Low, Vector512 High) GetNibbles(Vector512< [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + [CompExactlyDependsOn(typeof(PackedSimd))] private static Vector128 Shuffle(Vector128 maskLow, Vector128 maskHigh, Vector128 low, Vector128 high) { return SearchValues.ShuffleNativeModified(maskLow, low) & Vector128.ShuffleNative(maskHigh, high); @@ -338,6 +350,7 @@ private static Vector512 Shuffle(Vector512 maskLow, Vector512 [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd))] + [CompExactlyDependsOn(typeof(PackedSimd))] private static Vector128 RightShift1(Vector128 left, Vector128 right) { // Given input vectors like @@ -354,6 +367,16 @@ private static Vector128 RightShift1(Vector128 left, Vector128 { return AdvSimd.ExtractVector128(left, right, 15); } + else if (PackedSimd.IsSupported) + { + // ShuffleNative lowers to PackedSimd.Swizzle which clamps out-of-range + // indices (>= 16) to 0, so we can compose the two halves with an OR. 
+ Vector128 leftPart = Vector128.ShuffleNative(left, + Vector128.Create((byte)15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF)); + Vector128 rightPart = Vector128.ShuffleNative(right, + Vector128.Create((byte)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)); + return leftPart | rightPart; + } else { // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead @@ -365,6 +388,7 @@ private static Vector128 RightShift1(Vector128 left, Vector128 [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd))] + [CompExactlyDependsOn(typeof(PackedSimd))] private static Vector128 RightShift2(Vector128 left, Vector128 right) { // Given input vectors like @@ -381,6 +405,14 @@ private static Vector128 RightShift2(Vector128 left, Vector128 { return AdvSimd.ExtractVector128(left, right, 14); } + else if (PackedSimd.IsSupported) + { + Vector128 leftPart = Vector128.ShuffleNative(left, + Vector128.Create((byte)14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF)); + Vector128 rightPart = Vector128.ShuffleNative(right, + Vector128.Create((byte)0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)); + return leftPart | rightPart; + } else { // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs index a9ac9452c215cb..6c95f892366bd6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs @@ -7,6 +7,7 @@ using System.Globalization; using System.Runtime.Intrinsics; using 
System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; using System.Text; using System.Text.Unicode; @@ -128,7 +129,7 @@ private static SearchValues CreateFromNormalizedValues( return CreateForSingleValue(values[0], uniqueValues, ignoreCase, allAscii, asciiLettersOnly); } - if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && + if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported) && TryGetTeddyAcceleratedValues(values, uniqueValues, ignoreCase, allAscii, asciiLettersOnly, nonAsciiAffectedByCaseConversion, minLength) is { } searchValues) { return searchValues; @@ -198,7 +199,7 @@ static SearchValues PickAhoCorasickImplementation(AhoC int n = minLength == 2 ? 2 : 3; - if (Ssse3.IsSupported) + if (Ssse3.IsSupported || PackedSimd.IsSupported) { foreach (string value in values) { @@ -206,8 +207,9 @@ static SearchValues PickAhoCorasickImplementation(AhoC { // If we let null chars through here, Teddy would still work correctly, but it // would hit more false positives that the verification step would have to rule out. - // While we could flow a generic flag like Ssse3AndWasmHandleZeroInNeedle through, - // we expect such values to be rare enough that introducing more code is not worth it. + // Ssse3.PackUnsignedSaturate and PackedSimd.ConvertNarrowingSaturateUnsigned both + // treat negative signed-16 values as 0, so we filter out null-containing needles + // for both to avoid that source of false positives. return null; } } From 819c9fe3960296005519f7f74797a38c71269a64 Mon Sep 17 00:00:00 2001 From: lewing Date: Thu, 25 Jun 2026 00:32:05 -0500 Subject: [PATCH 06/11] Enable ProbabilisticMap vectorized SearchValues on Wasm SearchValues with values that span more than the ASCII range selects a ProbabilisticMap-based search. 
The vectorized IndexOfAny / LastIndexOfAny path (using ContainsMask16Chars + IsCharBitNotSet) was previously gated on Sse41 || AdvSimd.Arm64 only, so on Wasm the search fell back to the scalar SimpleLoop even when PackedSimd was available. This change is subtler than the other enablement PRs because the *layout* of the ProbabilisticMap bitmap also branches on the same gate (SetCharBit/IsCharBitSet at the top of the file). The [BypassReadyToRun] comment there warns that the construction and lookup branches must agree at all times during program execution. Widen all of the gates (SetCharBit/IsCharBitSet, ContainsMask16Chars, the IndexOfAny/LastIndexOfAny entry dispatcher, and the [CompExactlyDependsOn] on the Vector128 worker methods) to include PackedSimd consistently. ContainsMask16Chars gets a PackedSimd branch that mirrors the Sse2 algorithm using PackedSimd.ConvertNarrowingSaturateUnsigned for the two-vector narrowing step. IsCharBitNotSet already had a PackedSimd dependency for the table lookup via Vector128.ShuffleNative. ProbabilisticWithAsciiCharSearchValues already had PackedSimd dispatch. Validated: System.Memory.Tests on browser-wasm 52249/52249 passing, host arm64 52905/52906 unchanged. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../System/SearchValues/ProbabilisticMap.cs | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs index 0b09377b503e17..7a895c1a7d3b08 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs @@ -80,7 +80,7 @@ public ProbabilisticMap(ReadOnlySpan values) [BypassReadyToRun] private static void SetCharBit(ref uint charMap, byte value) { - if (Sse41.IsSupported || AdvSimd.Arm64.IsSupported) + if (Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported) { Unsafe.Add(ref Unsafe.As(ref charMap), value & VectorizedIndexMask) |= (byte)(1u << (value >> VectorizedIndexShift)); } @@ -92,7 +92,7 @@ private static void SetCharBit(ref uint charMap, byte value) [MethodImpl(MethodImplOptions.AggressiveInlining)] [BypassReadyToRun] - private static bool IsCharBitSet(ref uint charMap, byte value) => Sse41.IsSupported || AdvSimd.Arm64.IsSupported + private static bool IsCharBitSet(ref uint charMap, byte value) => Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported ? 
(Unsafe.Add(ref Unsafe.As(ref charMap), value & VectorizedIndexMask) & (1u << (value >> VectorizedIndexShift))) != 0 : (Unsafe.Add(ref charMap, value & PortableIndexMask) & (1u << (value >> PortableIndexShift))) != 0; @@ -220,6 +220,7 @@ private static Vector256 IsCharBitNotSetAvx2(Vector256 charMapLower, [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] [CompExactlyDependsOn(typeof(Sse2))] + [CompExactlyDependsOn(typeof(PackedSimd))] private static Vector128 ContainsMask16Chars(Vector128 charMapLower, Vector128 charMapUpper, ref char searchSpace) { Vector128 source0 = Vector128.LoadUnsafe(ref searchSpace); @@ -238,6 +239,11 @@ private static Vector128 ContainsMask16Chars(Vector128 charMapLower, sourceLower = AdvSimd.Arm64.UnzipEven(source0.AsByte(), source1.AsByte()); sourceUpper = AdvSimd.Arm64.UnzipOdd(source0.AsByte(), source1.AsByte()); } + else if (PackedSimd.IsSupported) + { + sourceLower = PackedSimd.ConvertNarrowingSaturateUnsigned((source0 & Vector128.Create((ushort)255)).AsInt16(), (source1 & Vector128.Create((ushort)255)).AsInt16()); + sourceUpper = PackedSimd.ConvertNarrowingSaturateUnsigned((source0 >>> 8).AsInt16(), (source1 >>> 8).AsInt16()); + } else { // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead @@ -392,7 +398,7 @@ private static unsafe int ProbabilisticLastIndexOfAny(ref char searchSpace, int internal static int IndexOfAny(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state) where TUseFastContains : struct, SearchValues.IRuntimeConst { - if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported) && searchSpaceLength >= 16) + if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported) && searchSpaceLength >= 16) { return Vector512.IsHardwareAccelerated && Avx512Vbmi.VL.IsSupported ? 
IndexOfAnyVectorizedAvx512(ref searchSpace, searchSpaceLength, ref state) @@ -406,7 +412,7 @@ internal static int IndexOfAny(ref char searchSpace, int searc internal static int LastIndexOfAny(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state) where TUseFastContains : struct, SearchValues.IRuntimeConst { - if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported) && searchSpaceLength >= 16) + if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported) && searchSpaceLength >= 16) { return Vector512.IsHardwareAccelerated && Avx512Vbmi.VL.IsSupported ? LastIndexOfAnyVectorizedAvx512(ref searchSpace, searchSpaceLength, ref state) @@ -501,10 +507,11 @@ private static int IndexOfAnyVectorizedAvx512(ref char searchS [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] [CompExactlyDependsOn(typeof(Sse41))] + [CompExactlyDependsOn(typeof(PackedSimd))] private static int IndexOfAnyVectorized(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state) where TUseFastContains : struct, SearchValues.IRuntimeConst { - Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported); + Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported); Debug.Assert(searchSpaceLength >= 16); ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); @@ -679,10 +686,11 @@ private static int LastIndexOfAnyVectorizedAvx512(ref char sea [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] [CompExactlyDependsOn(typeof(Sse41))] + [CompExactlyDependsOn(typeof(PackedSimd))] private static int LastIndexOfAnyVectorized(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state) where TUseFastContains : struct, SearchValues.IRuntimeConst { - Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported); + Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported); Debug.Assert(searchSpaceLength >= 16); ref char cur = ref Unsafe.Add(ref searchSpace, searchSpaceLength); 
From 80522ca629b8bb8069c7ddb6e2418a7a4a64e078 Mon Sep 17 00:00:00 2001 From: lewing Date: Thu, 25 Jun 2026 00:35:00 -0500 Subject: [PATCH 07/11] Use PackedSimd horizontal ops in Adler32 and XXH3 on Wasm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both algorithms were already vectorized on Wasm via the portable Vector128 else branch (Vector128.Widen + multiply + add), but the result was 3-5 portable ops per iteration where PackedSimd has a direct one-instruction equivalent. Adler32.UpdateVector128: add a PackedSimd branch alongside Sse2 and AdvSimd that uses PackedSimd.AddPairwiseWidening (i16x8.extadd_pairwise_i8x16_u and i32x4.extadd_pairwise_i16x8_u) for the s1 sum and PackedSimd.MultiplyWideningLower/Upper + AddPairwiseWidening for the weighted s2 sum. XxHashShared.MultiplyWideningLower: add a PackedSimd branch that computes { source[0]*source[1], source[2]*source[3] } via two shuffles + i64x2.extmul_low_i32x4_u, replacing the portable mask + 64-bit multiply pair. Validated: System.IO.Hashing.Tests 4196/4196 passing on both host arm64 and browser-wasm (the XxHash lane order is checked end-to-end via the algorithm output bytes — a swap would corrupt every hash).
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/System/IO/Hashing/Adler32.cs | 18 +++++++++++++++++ .../src/System/IO/Hashing/XxHashShared.cs | 20 ++++++++++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs index da54cdb3372b19..8b5d04065a544e 100644 --- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs @@ -8,6 +8,7 @@ #if NET using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; #endif @@ -300,6 +301,23 @@ private static uint UpdateVector128(uint adler, ReadOnlySpan source) wprod2 = AdvSimd.MultiplyWideningUpperAndAdd(wprod2, bytes2, tap2.AsByte()); vs2 = AdvSimd.AddPairwiseWideningAndAdd(vs2, wprod2); } + else if (PackedSimd.IsSupported) + { + // Widening byte sum: each byte -> ushort pair sum -> uint pair sum, then accumulate into vs1. + // Because weights are all positive (1-32), unsigned byte * unsigned byte multiply is valid for vs2. + Vector128 sumPairs1 = PackedSimd.AddPairwiseWidening(bytes1); + Vector128 sumPairs2 = PackedSimd.AddPairwiseWidening(bytes2); + vs1 += PackedSimd.AddPairwiseWidening(sumPairs1) + PackedSimd.AddPairwiseWidening(sumPairs2); + + // bytes * weights -> 8 ushorts low + 8 ushorts high, sum pairwise to 4 uints + 4 uints. 
+ Vector128 wprod1Lo = PackedSimd.MultiplyWideningLower(bytes1, tap1.AsByte()); + Vector128 wprod1Hi = PackedSimd.MultiplyWideningUpper(bytes1, tap1.AsByte()); + vs2 += PackedSimd.AddPairwiseWidening(wprod1Lo) + PackedSimd.AddPairwiseWidening(wprod1Hi); + + Vector128 wprod2Lo = PackedSimd.MultiplyWideningLower(bytes2, tap2.AsByte()); + Vector128 wprod2Hi = PackedSimd.MultiplyWideningUpper(bytes2, tap2.AsByte()); + vs2 += PackedSimd.AddPairwiseWidening(wprod2Lo) + PackedSimd.AddPairwiseWidening(wprod2Hi); + } else { (Vector128 lo1, Vector128 hi1) = Vector128.Widen(bytes1); diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHashShared.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHashShared.cs index 00f1f919a93c70..0f8a6dfa2e7540 100644 --- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHashShared.cs +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHashShared.cs @@ -9,6 +9,7 @@ #if NET using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; #endif @@ -702,12 +703,25 @@ private static Vector128 MultiplyWideningLower(Vector128 source) Vector64 sourceHigh = Vector128.Shuffle(source, Vector128.Create(1u, 3, 0, 0)).GetLower(); return AdvSimd.MultiplyWideningLower(sourceLow, sourceHigh); } + else if (Sse2.IsSupported) + { + Vector128 sourceLow = Vector128.Shuffle(source, Vector128.Create(1u, 0, 3, 0)); + return Sse2.Multiply(source, sourceLow); + } + else if (PackedSimd.IsSupported) + { + // PackedSimd.MultiplyWideningLower (i64x2.extmul_low_i32x4_u) does + // result[i] = (ulong)a[i] * (ulong)b[i] for i in {0, 1}. + // We need { source[0]*source[1], source[2]*source[3] } to match the Sse2/AdvSimd paths, + // so first move the even lanes into one operand and the odd lanes into the other. 
+ Vector128 evens = Vector128.Shuffle(source, Vector128.Create(0u, 2, 0, 0)); + Vector128 odds = Vector128.Shuffle(source, Vector128.Create(1u, 3, 0, 0)); + return PackedSimd.MultiplyWideningLower(evens, odds); + } else { Vector128 sourceLow = Vector128.Shuffle(source, Vector128.Create(1u, 0, 3, 0)); - return Sse2.IsSupported ? - Sse2.Multiply(source, sourceLow) : - (source & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64() * (sourceLow & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64(); + return (source & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64() * (sourceLow & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64(); } } #endif From 731effa2b3ae0c9a25c003f7b1b47726a8086acf Mon Sep 17 00:00:00 2001 From: lewing Date: Thu, 25 Jun 2026 06:59:53 -0500 Subject: [PATCH 08/11] Replace PackedSimd.Shuffle with ShuffleNative+OR in Vector128.UnpackLow/UnpackHigh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PackedSimd.Shuffle wraps i8x16.shuffle which requires its 16 lane indices to be compile-time constants. Mono interpreter accepted a Vector128.Create() constant operand at runtime, but Mono AOT cannot fold it and throws PlatformNotSupportedException at runtime. The same impact was already known and avoided in TeddyHelper.RightShift1/RightShift2 (see preceding commit on this branch) — use two Vector128.ShuffleNative calls (lowering to PackedSimd.Swizzle, which clamps out-of-range indices to 0) and OR the partial results together. Apply the same pattern in Vector128.UnpackLow/UnpackHigh. This was caught by CI as 50 GuidTests + cascaded reflection-invoke failures under the WasmTestOnChrome-MONO-ST (AOT) leg on PR #129838. On Mono interpreter all callers (HexConverter, Guid.FormatGuid) had already been validated end-to-end.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/System/Runtime/Intrinsics/Vector128.cs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index 987c3c3d30671f..5f0d2d5b60822f 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -4401,8 +4401,14 @@ internal static Vector128 UnpackLow(Vector128 left, Vector128 } else if (PackedSimd.IsSupported) { - return PackedSimd.Shuffle(left, right, - Vector128.Create((byte)0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)); + // Compose with two ShuffleNative (PackedSimd.Swizzle, which clamps indices >= 16 to 0) + // plus OR. PackedSimd.Shuffle (two-vector i8x16.shuffle) would require constant lane + // indices and is impractical to call portably from generic code paths. 
+ Vector128 leftPart = Vector128.ShuffleNative(left, + Vector128.Create((byte)0, 0xFF, 1, 0xFF, 2, 0xFF, 3, 0xFF, 4, 0xFF, 5, 0xFF, 6, 0xFF, 7, 0xFF)); + Vector128 rightPart = Vector128.ShuffleNative(right, + Vector128.Create((byte)0xFF, 0, 0xFF, 1, 0xFF, 2, 0xFF, 3, 0xFF, 4, 0xFF, 5, 0xFF, 6, 0xFF, 7)); + return leftPart | rightPart; } ThrowHelper.ThrowNotSupportedException(); return default; @@ -4424,8 +4430,11 @@ internal static Vector128 UnpackHigh(Vector128 left, Vector128 } else if (PackedSimd.IsSupported) { - return PackedSimd.Shuffle(left, right, - Vector128.Create((byte)8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)); + Vector128 leftPart = Vector128.ShuffleNative(left, + Vector128.Create((byte)8, 0xFF, 9, 0xFF, 10, 0xFF, 11, 0xFF, 12, 0xFF, 13, 0xFF, 14, 0xFF, 15, 0xFF)); + Vector128 rightPart = Vector128.ShuffleNative(right, + Vector128.Create((byte)0xFF, 8, 0xFF, 9, 0xFF, 10, 0xFF, 11, 0xFF, 12, 0xFF, 13, 0xFF, 14, 0xFF, 15)); + return leftPart | rightPart; } ThrowHelper.ThrowNotSupportedException(); return default; From 0e99f4a194c119b624c506e137aec3da642660cf Mon Sep 17 00:00:00 2001 From: lewing Date: Thu, 25 Jun 2026 07:36:10 -0500 Subject: [PATCH 09/11] Call PackedSimd.Swizzle directly in Vector128.Unpack and TeddyHelper.RightShift Both helpers previously dispatched to PackedSimd via Vector128.ShuffleNative, which itself has a Ssse3 -> AdvSimd.Arm64 -> PackedSimd if/else chain. The Mono SIMD intrinsic recognizer does not always lower that chain cleanly for less-traveled paths, surfacing as NIY interpreter assertions and runtime startup failures. Call PackedSimd.Swizzle (i8x16.swizzle) directly under the PackedSimd.IsSupported branch. The semantics are identical to ShuffleNative on Wasm (clamps indices >= 16 to 0) but the lowering goes through a single recognized intrinsic, avoiding the dispatcher chain. Validated: System.Memory.Tests on browser-wasm V8 interpreter 52249/52249 (covers TeddyHelper.RightShift1/2). 
The original NIY OutOfMemoryException:.ctor failure seen in System.Runtime.Tests with the prior ShuffleNative version is gone with this change. AOT behaviour will be re-validated by CI on push. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/System/Runtime/Intrinsics/Vector128.cs | 17 ++++++++++------- .../SearchValues/Strings/Helpers/TeddyHelper.cs | 14 ++++++++------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index 5f0d2d5b60822f..98659953164aec 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -4401,12 +4401,15 @@ internal static Vector128 UnpackLow(Vector128 left, Vector128 } else if (PackedSimd.IsSupported) { - // Compose with two ShuffleNative (PackedSimd.Swizzle, which clamps indices >= 16 to 0) - // plus OR. PackedSimd.Shuffle (two-vector i8x16.shuffle) would require constant lane - // indices and is impractical to call portably from generic code paths. - Vector128 leftPart = Vector128.ShuffleNative(left, + // Compose with two PackedSimd.Swizzle calls (clamp out-of-range to 0) plus OR. + // We call PackedSimd.Swizzle directly rather than Vector128.ShuffleNative because + // the latter goes through a Ssse3 -> AdvSimd.Arm64 -> PackedSimd dispatcher chain + // that the Mono SIMD intrinsic recognizer doesn't always lower cleanly. + // PackedSimd.Shuffle (two-vector i8x16.shuffle) requires constant lane indices + // and is impractical to call portably from generic code paths. 
+ Vector128 leftPart = PackedSimd.Swizzle(left, Vector128.Create((byte)0, 0xFF, 1, 0xFF, 2, 0xFF, 3, 0xFF, 4, 0xFF, 5, 0xFF, 6, 0xFF, 7, 0xFF)); - Vector128 rightPart = Vector128.ShuffleNative(right, + Vector128 rightPart = PackedSimd.Swizzle(right, Vector128.Create((byte)0xFF, 0, 0xFF, 1, 0xFF, 2, 0xFF, 3, 0xFF, 4, 0xFF, 5, 0xFF, 6, 0xFF, 7)); return leftPart | rightPart; } @@ -4430,9 +4433,9 @@ internal static Vector128 UnpackHigh(Vector128 left, Vector128 } else if (PackedSimd.IsSupported) { - Vector128 leftPart = Vector128.ShuffleNative(left, + Vector128 leftPart = PackedSimd.Swizzle(left, Vector128.Create((byte)8, 0xFF, 9, 0xFF, 10, 0xFF, 11, 0xFF, 12, 0xFF, 13, 0xFF, 14, 0xFF, 15, 0xFF)); - Vector128 rightPart = Vector128.ShuffleNative(right, + Vector128 rightPart = PackedSimd.Swizzle(right, Vector128.Create((byte)0xFF, 8, 0xFF, 9, 0xFF, 10, 0xFF, 11, 0xFF, 12, 0xFF, 13, 0xFF, 14, 0xFF, 15)); return leftPart | rightPart; } diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs index 7fae7b0fd35c50..8d6eb67922e4fb 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs @@ -369,11 +369,13 @@ private static Vector128 RightShift1(Vector128 left, Vector128 } else if (PackedSimd.IsSupported) { - // ShuffleNative lowers to PackedSimd.Swizzle which clamps out-of-range - // indices (>= 16) to 0, so we can compose the two halves with an OR. - Vector128 leftPart = Vector128.ShuffleNative(left, + // Call PackedSimd.Swizzle directly (i8x16.swizzle) rather than through + // Vector128.ShuffleNative's dispatcher chain, which the Mono SIMD intrinsic + // recognizer doesn't always lower cleanly. Swizzle clamps out-of-range + // indices (>= 16) to 0 so we can compose the two halves with OR. 
+ Vector128 leftPart = PackedSimd.Swizzle(left, Vector128.Create((byte)15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF)); - Vector128 rightPart = Vector128.ShuffleNative(right, + Vector128 rightPart = PackedSimd.Swizzle(right, Vector128.Create((byte)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)); return leftPart | rightPart; } @@ -407,9 +409,9 @@ private static Vector128 RightShift2(Vector128 left, Vector128 } else if (PackedSimd.IsSupported) { - Vector128 leftPart = Vector128.ShuffleNative(left, + Vector128 leftPart = PackedSimd.Swizzle(left, Vector128.Create((byte)14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF)); - Vector128 rightPart = Vector128.ShuffleNative(right, + Vector128 rightPart = PackedSimd.Swizzle(right, Vector128.Create((byte)0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)); return leftPart | rightPart; } From 254370263306d9e5aed3b97d66355d93a3327fcb Mon Sep 17 00:00:00 2001 From: lewing Date: Thu, 25 Jun 2026 07:47:43 -0500 Subject: [PATCH 10/11] Update Utf8Utility.Validation comment for Wasm Reword the LoopTerminatedEarlyDueToNonAsciiData label comment to mention that Wasm is also a little-endian-only platform reaching this point through the PackedSimd branch added earlier in this PR. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/System/Text/Unicode/Utf8Utility.Validation.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index 5388c6c68d0052..dff9af0fdcac9d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -190,8 +190,8 @@ internal static unsafe partial class Utf8Utility #if NET LoopTerminatedEarlyDueToNonAsciiData: - // x86 can only be little endian, while ARM can be big or little endian - // so if we reached this label we need to check both combinations are supported + // x86 and Wasm can only be little endian, while ARM can be big or little endian, + // so if we reached this label we need to check the LE-restricted combinations as well. Debug.Assert((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported || PackedSimd.IsSupported); From ec32de08b80d33601e50cdefebf57b94328f45ab Mon Sep 17 00:00:00 2001 From: lewing Date: Thu, 25 Jun 2026 08:01:36 -0500 Subject: [PATCH 11/11] Drop redundant '&& BitConverter.IsLittleEndian' from PackedSimd gate Wasm is always little-endian by spec, so the BitConverter.IsLittleEndian check on the PackedSimd.IsSupported branch is a no-op. Keep the check on the AdvSimd.Arm64 branch where it actually matters (NEON can be big- or little-endian on some configurations). Mirrors how the existing Sse41.X64 branch in the same gate has no LE check (x86-64 is also always little-endian).
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/System/Text/Unicode/Utf8Utility.Transcoding.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs index 7931269a0e5c60..581d231076678f 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -885,7 +885,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt #if NET Vector128 nonAsciiUtf16DataMask; - if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || (PackedSimd.IsSupported && BitConverter.IsLittleEndian)) + if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || PackedSimd.IsSupported) { nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char } @@ -945,7 +945,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining); #if NET - if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || (PackedSimd.IsSupported && BitConverter.IsLittleEndian)) + if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || PackedSimd.IsSupported) { // Try reading and writing 8 elements per iteration. uint maxIters = minElementsRemaining / 8;