From c69998277eb6662b6fb80ffa864de99aed75eda3 Mon Sep 17 00:00:00 2001
From: lewing <lewing@microsoft.com>
Date: Wed, 24 Jun 2026 23:53:05 -0500
Subject: [PATCH 01/11] Add Wasm PackedSimd path to
 Vector128.UnpackLow/UnpackHigh

These internal helpers are used by HexConverter.AsciiToHexVector128
and other byte-interleaving code paths. They previously dispatched
only to Sse2.UnpackLow/UnpackHigh or AdvSimd.Arm64.ZipLow/ZipHigh
and threw NotSupportedException on platforms without either ISA.

With the recent change that enables HexConverter and Guid format on
Wasm via PackedSimd, the helpers became reachable on browser-wasm
and started throwing at runtime in libraries tests.

Lower to PackedSimd.Shuffle with a constant 16-byte index vector
(i8x16.shuffle) when PackedSimd is supported. Validated via
System.Runtime.Extensions.Tests on browser-wasm (8224 passing, 0
failed) after the previous run failed 132 tests.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../System/Runtime/Intrinsics/Vector128.cs    | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
index 16f46daddd7eab..987c3c3d30671f 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
@@ -4388,33 +4388,47 @@ internal static void SetElementUnsafe<T>(in this Vector128<T> vector, int index,
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
         [CompExactlyDependsOn(typeof(Sse2))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         internal static Vector128<byte> UnpackLow(Vector128<byte> left, Vector128<byte> right)
         {
             if (Sse2.IsSupported)
             {
                 return Sse2.UnpackLow(left, right);
             }
-            else if (!AdvSimd.Arm64.IsSupported)
+            else if (AdvSimd.Arm64.IsSupported)
             {
-                ThrowHelper.ThrowNotSupportedException();
+                return AdvSimd.Arm64.ZipLow(left, right);
+            }
+            else if (PackedSimd.IsSupported)
+            {
+                return PackedSimd.Shuffle(left, right,
+                    Vector128.Create((byte)0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23));
             }
-            return AdvSimd.Arm64.ZipLow(left, right);
+            ThrowHelper.ThrowNotSupportedException();
+            return default;
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
         [CompExactlyDependsOn(typeof(Sse2))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         internal static Vector128<byte> UnpackHigh(Vector128<byte> left, Vector128<byte> right)
         {
             if (Sse2.IsSupported)
             {
                 return Sse2.UnpackHigh(left, right);
             }
-            else if (!AdvSimd.Arm64.IsSupported)
+            else if (AdvSimd.Arm64.IsSupported)
             {
-                ThrowHelper.ThrowNotSupportedException();
+                return AdvSimd.Arm64.ZipHigh(left, right);
+            }
+            else if (PackedSimd.IsSupported)
+            {
+                return PackedSimd.Shuffle(left, right,
+                    Vector128.Create((byte)8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31));
             }
-            return AdvSimd.Arm64.ZipHigh(left, right);
+            ThrowHelper.ThrowNotSupportedException();
+            return default;
         }
     }
 }

From 3dc956b6465a6d8b2d36cf0be10c894048002c45 Mon Sep 17 00:00:00 2001
From: lewing <lewing@microsoft.com>
Date: Wed, 24 Jun 2026 23:34:53 -0500
Subject: [PATCH 02/11] Enable Vector128 hex/Guid format fast path on Wasm

HexConverter.AsciiToHexVector128, HexConverter.EncodeTo_Vector128 and
Guid.FormatGuidVector128Utf8 already use only portable Vector128 ops
(Vector128.ShuffleNative, Vector128.UnpackLow/High, Vector128.Shuffle
with constant indices) plus an optional AdvSimd.Arm64-specific branch.
The gates at Convert.ToHexString, EncodeToUtf8/Utf16, and Guid.ToString
required Ssse3 or AdvSimd.Arm64, so Wasm fell back to scalar even with
PackedSimd.

Add PackedSimd.IsSupported to the gates and the [CompExactlyDependsOn]
attributes on the helpers. The bodies are unchanged; on Wasm the
existing else branch (portable Vector128.Shuffle) is selected.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/libraries/Common/src/System/HexConverter.cs         | 8 +++++---
 src/libraries/System.Private.CoreLib/src/System/Guid.cs | 6 ++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/libraries/Common/src/System/HexConverter.cs b/src/libraries/Common/src/System/HexConverter.cs
index 10316e0c610325..f0a051393f6660 100644
--- a/src/libraries/Common/src/System/HexConverter.cs
+++ b/src/libraries/Common/src/System/HexConverter.cs
@@ -96,9 +96,10 @@ public static void ToCharsBuffer(byte value, Span<char> buffer, int startingInde
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         internal static (Vector128<byte>, Vector128<byte>) AsciiToHexVector128(Vector128<byte> src, Vector128<byte> hexMap)
         {
-            Debug.Assert(Ssse3.IsSupported || AdvSimd.Arm64.IsSupported);
+            Debug.Assert(Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported);
 
             // The algorithm is simple: a single srcVec (contains the whole 16b Guid) is converted
             // into nibbles and then, via hexMap, converted into a HEX representation via
@@ -115,6 +116,7 @@ internal static (Vector128<byte>, Vector128<byte>) AsciiToHexVector128(Vector128
 
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         private static void EncodeTo_Vector128<TChar>(ReadOnlySpan<byte> source, Span<TChar> destination, Casing casing)
         {
             Debug.Assert(source.Length >= (Vector128<TChar>.Count / 2));
@@ -187,7 +189,7 @@ public static void EncodeToUtf8(ReadOnlySpan<byte> source, Span<byte> utf8Destin
             Debug.Assert(utf8Destination.Length >= (source.Length * 2));
 
 #if SYSTEM_PRIVATE_CORELIB
-            if ((AdvSimd.Arm64.IsSupported || Ssse3.IsSupported) && (source.Length >= (Vector128<byte>.Count / 2)))
+            if ((AdvSimd.Arm64.IsSupported || Ssse3.IsSupported || PackedSimd.IsSupported) && (source.Length >= (Vector128<byte>.Count / 2)))
             {
                 EncodeTo_Vector128(source, utf8Destination, casing);
                 return;
@@ -204,7 +206,7 @@ public static void EncodeToUtf16(ReadOnlySpan<byte> source, Span<char> destinati
             Debug.Assert(destination.Length >= (source.Length * 2));
 
 #if SYSTEM_PRIVATE_CORELIB
-            if ((AdvSimd.Arm64.IsSupported || Ssse3.IsSupported) && (source.Length >= (Vector128<ushort>.Count / 2)))
+            if ((AdvSimd.Arm64.IsSupported || Ssse3.IsSupported || PackedSimd.IsSupported) && (source.Length >= (Vector128<ushort>.Count / 2)))
             {
                 EncodeTo_Vector128(source, Unsafe.BitCast<Span<char>, Span<ushort>>(destination), casing);
                 return;
diff --git a/src/libraries/System.Private.CoreLib/src/System/Guid.cs b/src/libraries/System.Private.CoreLib/src/System/Guid.cs
index 60b6b289991ca3..1ac2e1ccf24451 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Guid.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Guid.cs
@@ -10,6 +10,7 @@
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
 using System.Runtime.Intrinsics.X86;
 using System.Runtime.Versioning;
 using System.Text;
@@ -1345,7 +1346,7 @@ internal unsafe bool TryFormatCore<TChar>(Span<TChar> destination, out int chars
                 }
                 flags >>= 8;
 
-                if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian)
+                if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported) && BitConverter.IsLittleEndian)
                 {
                     // Vectorized implementation for D, N, P and B formats:
                     // [{|(]dddddddd[-]dddd[-]dddd[-]dddd[-]dddddddddddd[}|)]
@@ -1513,9 +1514,10 @@ static void WriteHex(Span<TChar> dest, int offset, int val, bool appendComma = t
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         private static (Vector128<byte>, Vector128<byte>, Vector128<byte>) FormatGuidVector128Utf8(Guid value, bool useDashes)
         {
-            Debug.Assert((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian);
+            Debug.Assert((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported) && BitConverter.IsLittleEndian);
             // Vectorized implementation for D, N, P and B formats:
             // [{|(]dddddddd[-]dddd[-]dddd[-]dddd[-]dddddddddddd[}|)]
 

From 061b22bffcf212154b7d75a30ba8d15dc91ce22a Mon Sep 17 00:00:00 2001
From: lewing <lewing@microsoft.com>
Date: Wed, 24 Jun 2026 23:35:37 -0500
Subject: [PATCH 03/11] Vectorize Utf8Utility.Validation ASCII fast path on
 Wasm

GetPointerToFirstInvalidChar's inner ASCII-scan loop dispatched on
AdvSimd.Arm64 (with bitmask128) or Sse2 (with MoveMask), falling back
to a scalar 4-DWORD-at-a-time path otherwise. On Wasm with PackedSimd,
neither SIMD branch was taken, so UTF-8 validation took the scalar
path.

Add a PackedSimd.IsSupported branch that uses portable
Vector128.LoadUnsafe + ExtractMostSignificantBits to compute the same
per-byte non-ASCII bitmask used by the Sse2 path. Update the post-loop
Debug.Assert to include PackedSimd.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../System/Text/Unicode/Utf8Utility.Validation.cs    | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index 0717ed348af4ed..5388c6c68d0052 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -9,6 +9,7 @@
 #if NET
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
 using System.Runtime.Intrinsics.X86;
 #endif
 
@@ -158,6 +159,15 @@ internal static unsafe partial class Utf8Utility
                                         goto LoopTerminatedEarlyDueToNonAsciiData;
                                     }
                                 }
+                                else if (PackedSimd.IsSupported)
+                                {
+                                    uint mask = Vector128.LoadUnsafe(ref *pInputBuffer).ExtractMostSignificantBits();
+                                    if (mask != 0)
+                                    {
+                                        trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask);
+                                        goto LoopTerminatedEarlyDueToNonAsciiData;
+                                    }
+                                }
                                 else
 #endif
                                 {
@@ -182,7 +192,7 @@ internal static unsafe partial class Utf8Utility
                     LoopTerminatedEarlyDueToNonAsciiData:
                         // x86 can only be little endian, while ARM can be big or little endian
                         // so if we reached this label we need to check both combinations are supported
-                        Debug.Assert((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported);
+                        Debug.Assert((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported || PackedSimd.IsSupported);
 
 
                         // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit

From da08d905c6e922246bddec1b1f11ca08be530928 Mon Sep 17 00:00:00 2001
From: lewing <lewing@microsoft.com>
Date: Thu, 25 Jun 2026 00:07:33 -0500
Subject: [PATCH 04/11] Vectorize Utf8Utility.Transcoding ASCII fast path on
 Wasm

TranscodeToUtf8's 8-char ASCII fast loop used a Vector128<short> read,
a mask-and-compare to detect non-ASCII, and a narrow-and-store of 8
bytes using Sse2.PackUnsignedSaturate /
AdvSimd.ExtractNarrowingSaturateUnsignedLower. Two follow-on 4-char
sites narrowed 4 bytes the same way. All four sites required Sse41.X64
or AdvSimd.Arm64 + LE, so Wasm took the 4-DWORD-at-a-time scalar
fallback.

Add PackedSimd branches at every dispatch site:
 - Outer entry gate (declaration + entry condition)
 - 8-char narrow-store: use the existing portable AND-compare for the
   non-ASCII test (same code Sse41 already uses) and
   PackedSimd.ConvertNarrowingSaturateUnsigned + scalar extract for
   the store
 - 4-char narrow-stores: PackedSimd.ConvertNarrowingSaturateUnsigned +
   AsUInt32().ToScalar() unaligned write

The Sse2.X64.ConvertToUInt64 sub-branch already had an else path that
calls AsUInt64().ToScalar(), which works on Wasm without changes.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../Text/Unicode/Utf8Utility.Transcoding.cs   | 26 +++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs
index ff095110653805..7931269a0e5c60 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs
@@ -10,6 +10,7 @@
 #if NET
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
 using System.Runtime.Intrinsics.X86;
 #endif
 
@@ -884,7 +885,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
 #if NET
             Vector128<short> nonAsciiUtf16DataMask;
 
-            if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
+            if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || (PackedSimd.IsSupported && BitConverter.IsLittleEndian))
             {
                 nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char
             }
@@ -944,7 +945,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                     uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);
 
 #if NET
-                    if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
+                    if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || (PackedSimd.IsSupported && BitConverter.IsLittleEndian))
                     {
                         // Try reading and writing 8 elements per iteration.
                         uint maxIters = minElementsRemaining / 8;
@@ -982,6 +983,17 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                                 // narrow and write
                                 Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64());
                             }
+                            else if (PackedSimd.IsSupported)
+                            {
+                                if ((utf16Data & nonAsciiUtf16DataMask) != Vector128<short>.Zero)
+                                {
+                                    goto LoopTerminatedDueToNonAsciiDataInVectorLocal;
+                                }
+
+                                // narrow and write low 8 bytes
+                                Vector128<byte> narrowed = PackedSimd.ConvertNarrowingSaturateUnsigned(utf16Data, utf16Data);
+                                Unsafe.WriteUnaligned<ulong>(pOutputBuffer, narrowed.AsUInt64().ToScalar());
+                            }
                             else
                             {
                                 // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead
@@ -1015,6 +1027,11 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                             {
                                 Unsafe.WriteUnaligned(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
                             }
+                            else if (PackedSimd.IsSupported)
+                            {
+                                Vector128<byte> narrowed = PackedSimd.ConvertNarrowingSaturateUnsigned(utf16Data, utf16Data);
+                                Unsafe.WriteUnaligned<uint>(pOutputBuffer, narrowed.AsUInt32().ToScalar());
+                            }
                             else
                             {
                                 // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead
@@ -1058,6 +1075,11 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                             {
                                 Unsafe.WriteUnaligned(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
                             }
+                            else if (PackedSimd.IsSupported)
+                            {
+                                Vector128<byte> narrowed = PackedSimd.ConvertNarrowingSaturateUnsigned(utf16Data, utf16Data);
+                                Unsafe.WriteUnaligned<uint>(pOutputBuffer, narrowed.AsUInt32().ToScalar());
+                            }
                             else
                             {
                                 // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead

From 2caa5e259971daa85d4c9f8f3604baf43b2da1ab Mon Sep 17 00:00:00 2001
From: lewing <lewing@microsoft.com>
Date: Thu, 25 Jun 2026 00:22:24 -0500
Subject: [PATCH 05/11] Enable Teddy multi-string search on Wasm

SearchValues<string> with 2+ values previously selected the
Aho-Corasick implementation on Wasm because the Teddy entry gate in
StringSearchValues.cs required Ssse3 or AdvSimd.Arm64. Teddy's core
Vector128 primitives in TeddyHelper.cs (LoadAndPack16AsciiChars, the
nibble GetNibbles helper, the two-table Shuffle, and RightShift1/2)
similarly excluded PackedSimd.

Add PackedSimd branches throughout:
 - LoadAndPack16AsciiChars: PackedSimd.ConvertNarrowingSaturateUnsigned
 - GetNibbles: PackedSimd needs the explicit '& 0xF' on the low half
   because Swizzle returns 0 for indices >= 16 (unlike Ssse3's
   implicit AND of the low 4 bits)
 - Shuffle: already uses portable Vector128.ShuffleNative which maps
   to PackedSimd.Swizzle; just widen the [CompExactlyDependsOn]
 - RightShift1/RightShift2: compose two Vector128.ShuffleNative calls
   with constant index vectors and OR the halves. PackedSimd.Shuffle
   (two-vector i8x16.shuffle) is impractical due to constant lane
   index requirements; Swizzle returns 0 for out-of-range indices
   (>= 16), which makes the OR safe.

Widen the entry gate in CreateFromNormalizedValues
(StringSearchValues.cs) and the null-char filter in
TryGetTeddyAcceleratedValues (PackedSimd shares the Ssse3 path's
PackUnsignedSaturate behavior where signed negative inputs become 0,
so null-containing needles produce more false positives on both).

Widen [CompExactlyDependsOn] on the IndexOfAnyN2/N3 + Vector128
helpers in AsciiStringSearchValuesTeddyBase.cs.

Validated: System.Memory.Tests on browser-wasm 52249/52249 passing
(covers SearchValues<string> Teddy paths via StringSearchValues
tests), host 52905/52906 unchanged.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../AsciiStringSearchValuesTeddyBase.cs       |  5 +++
 .../Strings/Helpers/TeddyHelper.cs            | 34 ++++++++++++++++++-
 .../Strings/StringSearchValues.cs             | 10 +++---
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/AsciiStringSearchValuesTeddyBase.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/AsciiStringSearchValuesTeddyBase.cs
index 0869ad1553f895..2d96c3009f9fdc 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/AsciiStringSearchValuesTeddyBase.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/AsciiStringSearchValuesTeddyBase.cs
@@ -8,6 +8,7 @@
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
 using System.Runtime.Intrinsics.X86;
 using static System.Buffers.StringSearchValuesHelper;
 using static System.Buffers.TeddyHelper;
@@ -150,6 +151,7 @@ protected AsciiStringSearchValuesTeddyBase(string[][] buckets, ReadOnlySpan<stri
 
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         protected int IndexOfAnyN2(ReadOnlySpan<char> span)
         {
             // The behavior of the rest of the function remains the same if Avx2 or Avx512BW aren't supported
@@ -170,6 +172,7 @@ protected int IndexOfAnyN2(ReadOnlySpan<char> span)
 
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         protected int IndexOfAnyN3(ReadOnlySpan<char> span)
         {
             // The behavior of the rest of the function remains the same if Avx2 or Avx512BW aren't supported
@@ -190,6 +193,7 @@ protected int IndexOfAnyN3(ReadOnlySpan<char> span)
 
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         private int IndexOfAnyN2Vector128(ReadOnlySpan<char> span)
         {
             // See comments in 'IndexOfAnyN3Vector128' below.
@@ -350,6 +354,7 @@ private int IndexOfAnyN2Avx512(ReadOnlySpan<char> span)
 
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         private int IndexOfAnyN3Vector128(ReadOnlySpan<char> span)
         {
             // We can't process inputs shorter than 18 characters in a vectorized manner here.
diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs
index 064838c98cf361..7fae7b0fd35c50 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs
@@ -4,6 +4,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
 using System.Runtime.Intrinsics.X86;
 
 namespace System.Buffers
@@ -17,6 +18,7 @@ internal static class TeddyHelper
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         public static (Vector128<byte> Result, Vector128<byte> Prev0) ProcessInputN2(
             Vector128<byte> input,
             Vector128<byte> prev0,
@@ -90,6 +92,7 @@ public static (Vector512<byte> Result, Vector512<byte> Prev0) ProcessInputN2(
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         public static (Vector128<byte> Result, Vector128<byte> Prev0, Vector128<byte> Prev1) ProcessInputN3(
             Vector128<byte> input,
             Vector128<byte> prev0, Vector128<byte> prev1,
@@ -216,6 +219,7 @@ public static (Vector512<byte> Result, Vector512<byte> Prev0, Vector512<byte> Pr
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(Sse2))]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         public static Vector128<byte> LoadAndPack16AsciiChars(ref char source)
         {
             Vector128<ushort> source0 = Vector128.LoadUnsafe(ref source);
@@ -229,6 +233,10 @@ public static Vector128<byte> LoadAndPack16AsciiChars(ref char source)
             {
                 return AdvSimd.Arm64.UnzipEven(source0.AsByte(), source1.AsByte());
             }
+            else if (PackedSimd.IsSupported)
+            {
+                return PackedSimd.ConvertNarrowingSaturateUnsigned(source0.AsInt16(), source1.AsInt16());
+            }
             else
             {
                 // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead
@@ -276,10 +284,13 @@ public static Vector512<byte> LoadAndPack64AsciiChars(ref char source)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         private static (Vector128<byte> Low, Vector128<byte> High) GetNibbles(Vector128<byte> input)
         {
             // 'low' is not strictly correct here, but we take advantage of Ssse3.Shuffle's behavior
-            // of doing an implicit 'AND 0xF' in order to skip the redundant AND.
+            // of doing an implicit 'AND 0xF' in order to skip the redundant AND. PackedSimd.Swizzle
+            // and AdvSimd's table lookup return 0 for indices >= 16 (instead of masking the low 4
+            // bits), so they need the explicit AND.
             Vector128<byte> low = Ssse3.IsSupported
                 ? input
                 : input & Vector128.Create((byte)0xF);
@@ -316,6 +327,7 @@ private static (Vector512<byte> Low, Vector512<byte> High) GetNibbles(Vector512<
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         private static Vector128<byte> Shuffle(Vector128<byte> maskLow, Vector128<byte> maskHigh, Vector128<byte> low, Vector128<byte> high)
         {
             return SearchValues.ShuffleNativeModified(maskLow, low) & Vector128.ShuffleNative(maskHigh, high);
@@ -338,6 +350,7 @@ private static Vector512<byte> Shuffle(Vector512<byte> maskLow, Vector512<byte>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         private static Vector128<byte> RightShift1(Vector128<byte> left, Vector128<byte> right)
         {
             // Given input vectors like
@@ -354,6 +367,16 @@ private static Vector128<byte> RightShift1(Vector128<byte> left, Vector128<byte>
             {
                 return AdvSimd.ExtractVector128(left, right, 15);
             }
+            else if (PackedSimd.IsSupported)
+            {
+                // ShuffleNative lowers to PackedSimd.Swizzle, which yields 0 for any
+                // out-of-range index (>= 16), so we can compose the two halves with an OR.
+                Vector128<byte> leftPart = Vector128.ShuffleNative(left,
+                    Vector128.Create((byte)15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));
+                Vector128<byte> rightPart = Vector128.ShuffleNative(right,
+                    Vector128.Create((byte)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14));
+                return leftPart | rightPart;
+            }
             else
             {
                 // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead
@@ -365,6 +388,7 @@ private static Vector128<byte> RightShift1(Vector128<byte> left, Vector128<byte>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(Ssse3))]
         [CompExactlyDependsOn(typeof(AdvSimd))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         private static Vector128<byte> RightShift2(Vector128<byte> left, Vector128<byte> right)
         {
             // Given input vectors like
@@ -381,6 +405,14 @@ private static Vector128<byte> RightShift2(Vector128<byte> left, Vector128<byte>
             {
                 return AdvSimd.ExtractVector128(left, right, 14);
             }
+            else if (PackedSimd.IsSupported)
+            {
+                Vector128<byte> leftPart = Vector128.ShuffleNative(left,
+                    Vector128.Create((byte)14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));
+                Vector128<byte> rightPart = Vector128.ShuffleNative(right,
+                    Vector128.Create((byte)0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13));
+                return leftPart | rightPart;
+            }
             else
             {
                 // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead
diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs
index a9ac9452c215cb..6c95f892366bd6 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs
@@ -7,6 +7,7 @@
 using System.Globalization;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
 using System.Runtime.Intrinsics.X86;
 using System.Text;
 using System.Text.Unicode;
@@ -128,7 +129,7 @@ private static SearchValues<string> CreateFromNormalizedValues(
                 return CreateForSingleValue(values[0], uniqueValues, ignoreCase, allAscii, asciiLettersOnly);
             }
 
-            if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) &&
+            if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported) &&
                 TryGetTeddyAcceleratedValues(values, uniqueValues, ignoreCase, allAscii, asciiLettersOnly, nonAsciiAffectedByCaseConversion, minLength) is { } searchValues)
             {
                 return searchValues;
@@ -198,7 +199,7 @@ static SearchValues<string> PickAhoCorasickImplementation<TCaseSensitivity>(AhoC
 
             int n = minLength == 2 ? 2 : 3;
 
-            if (Ssse3.IsSupported)
+            if (Ssse3.IsSupported || PackedSimd.IsSupported)
             {
                 foreach (string value in values)
                 {
@@ -206,8 +207,9 @@ static SearchValues<string> PickAhoCorasickImplementation<TCaseSensitivity>(AhoC
                     {
                         // If we let null chars through here, Teddy would still work correctly, but it
                         // would hit more false positives that the verification step would have to rule out.
-                        // While we could flow a generic flag like Ssse3AndWasmHandleZeroInNeedle through,
-                        // we expect such values to be rare enough that introducing more code is not worth it.
+                        // Ssse3.PackUnsignedSaturate and PackedSimd.ConvertNarrowingSaturateUnsigned both
+                        // treat negative signed-16 values as 0, so we filter out null-containing needles
+                        // for both to avoid that source of false positives.
                         return null;
                     }
                 }

From 819c9fe3960296005519f7f74797a38c71269a64 Mon Sep 17 00:00:00 2001
From: lewing <lewing@microsoft.com>
Date: Thu, 25 Jun 2026 00:32:05 -0500
Subject: [PATCH 06/11] Enable ProbabilisticMap vectorized SearchValues<char>
 on Wasm

SearchValues<char> with values that span more than the ASCII range
selects a ProbabilisticMap-based search. The vectorized IndexOfAny /
LastIndexOfAny path (using ContainsMask16Chars + IsCharBitNotSet)
was previously gated on Sse41 || AdvSimd.Arm64 only, so on Wasm the
search fell back to the scalar SimpleLoop even when PackedSimd was
available.

This change is subtler than the other enablement PRs because the
*layout* of the ProbabilisticMap bitmap also branches on the same
gate (SetCharBit/IsCharBitSet at the top of the file). The
[BypassReadyToRun] comment there warns that the construction and
lookup branches must agree at all times during program execution.
Widen all three gates (SetCharBit/IsCharBitSet, ContainsMask16Chars,
and the IndexOfAny/LastIndexOfAny entry dispatcher), together with the
[CompExactlyDependsOn] attributes on the Vector128 worker methods, to
include PackedSimd consistently.

ContainsMask16Chars gets a PackedSimd branch that mirrors the Sse2
algorithm using PackedSimd.ConvertNarrowingSaturateUnsigned for the
two-vector narrowing step. IsCharBitNotSet already had a PackedSimd
dependency for the table lookup via Vector128.ShuffleNative.

ProbabilisticWithAsciiCharSearchValues already had PackedSimd dispatch.

Validated: System.Memory.Tests on browser-wasm 52249/52249 passing,
host arm64 52905/52906 unchanged.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../System/SearchValues/ProbabilisticMap.cs   | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs
index 0b09377b503e17..7a895c1a7d3b08 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs
@@ -80,7 +80,7 @@ public ProbabilisticMap(ReadOnlySpan<char> values)
         [BypassReadyToRun]
         private static void SetCharBit(ref uint charMap, byte value)
         {
-            if (Sse41.IsSupported || AdvSimd.Arm64.IsSupported)
+            if (Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported)
             {
                 Unsafe.Add(ref Unsafe.As<uint, byte>(ref charMap), value & VectorizedIndexMask) |= (byte)(1u << (value >> VectorizedIndexShift));
             }
@@ -92,7 +92,7 @@ private static void SetCharBit(ref uint charMap, byte value)
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [BypassReadyToRun]
-        private static bool IsCharBitSet(ref uint charMap, byte value) => Sse41.IsSupported || AdvSimd.Arm64.IsSupported
+        private static bool IsCharBitSet(ref uint charMap, byte value) => Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported
             ? (Unsafe.Add(ref Unsafe.As<uint, byte>(ref charMap), value & VectorizedIndexMask) & (1u << (value >> VectorizedIndexShift))) != 0
             : (Unsafe.Add(ref charMap, value & PortableIndexMask) & (1u << (value >> PortableIndexShift))) != 0;
 
@@ -220,6 +220,7 @@ private static Vector256<byte> IsCharBitNotSetAvx2(Vector256<byte> charMapLower,
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
         [CompExactlyDependsOn(typeof(Sse2))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         private static Vector128<byte> ContainsMask16Chars(Vector128<byte> charMapLower, Vector128<byte> charMapUpper, ref char searchSpace)
         {
             Vector128<ushort> source0 = Vector128.LoadUnsafe(ref searchSpace);
@@ -238,6 +239,11 @@ private static Vector128<byte> ContainsMask16Chars(Vector128<byte> charMapLower,
                 sourceLower = AdvSimd.Arm64.UnzipEven(source0.AsByte(), source1.AsByte());
                 sourceUpper = AdvSimd.Arm64.UnzipOdd(source0.AsByte(), source1.AsByte());
             }
+            else if (PackedSimd.IsSupported)
+            {
+                sourceLower = PackedSimd.ConvertNarrowingSaturateUnsigned((source0 & Vector128.Create((ushort)255)).AsInt16(), (source1 & Vector128.Create((ushort)255)).AsInt16());
+                sourceUpper = PackedSimd.ConvertNarrowingSaturateUnsigned((source0 >>> 8).AsInt16(), (source1 >>> 8).AsInt16());
+            }
             else
             {
                 // We explicitly recheck each IsSupported query to ensure that the trimmer can see which paths are live/dead
@@ -392,7 +398,7 @@ private static unsafe int ProbabilisticLastIndexOfAny(ref char searchSpace, int
         internal static int IndexOfAny<TUseFastContains>(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state)
             where TUseFastContains : struct, SearchValues.IRuntimeConst
         {
-            if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported) && searchSpaceLength >= 16)
+            if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported) && searchSpaceLength >= 16)
             {
                 return Vector512.IsHardwareAccelerated && Avx512Vbmi.VL.IsSupported
                     ? IndexOfAnyVectorizedAvx512<TUseFastContains>(ref searchSpace, searchSpaceLength, ref state)
@@ -406,7 +412,7 @@ internal static int IndexOfAny<TUseFastContains>(ref char searchSpace, int searc
         internal static int LastIndexOfAny<TUseFastContains>(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state)
             where TUseFastContains : struct, SearchValues.IRuntimeConst
         {
-            if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported) && searchSpaceLength >= 16)
+            if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported) && searchSpaceLength >= 16)
             {
                 return Vector512.IsHardwareAccelerated && Avx512Vbmi.VL.IsSupported
                     ? LastIndexOfAnyVectorizedAvx512<TUseFastContains>(ref searchSpace, searchSpaceLength, ref state)
@@ -501,10 +507,11 @@ private static int IndexOfAnyVectorizedAvx512<TUseFastContains>(ref char searchS
 
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
         [CompExactlyDependsOn(typeof(Sse41))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         private static int IndexOfAnyVectorized<TUseFastContains>(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state)
             where TUseFastContains : struct, SearchValues.IRuntimeConst
         {
-            Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported);
+            Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported);
             Debug.Assert(searchSpaceLength >= 16);
 
             ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength);
@@ -679,10 +686,11 @@ private static int LastIndexOfAnyVectorizedAvx512<TUseFastContains>(ref char sea
 
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
         [CompExactlyDependsOn(typeof(Sse41))]
+        [CompExactlyDependsOn(typeof(PackedSimd))]
         private static int LastIndexOfAnyVectorized<TUseFastContains>(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state)
             where TUseFastContains : struct, SearchValues.IRuntimeConst
         {
-            Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported);
+            Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported);
             Debug.Assert(searchSpaceLength >= 16);
 
             ref char cur = ref Unsafe.Add(ref searchSpace, searchSpaceLength);

From 80522ca629b8bb8069c7ddb6e2418a7a4a64e078 Mon Sep 17 00:00:00 2001
From: lewing <lewing@microsoft.com>
Date: Thu, 25 Jun 2026 00:35:00 -0500
Subject: [PATCH 07/11] Use PackedSimd horizontal ops in Adler32 and XXH3 on
 Wasm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both algorithms were already vectorized on Wasm via the portable
Vector128 else branch (Vector128.Widen + multiply + add), but the
result was 3-5 portable ops per iteration where PackedSimd has a
direct one-instruction equivalent.

Adler32.UpdateVector128: add a PackedSimd branch alongside Sse2 and
AdvSimd that uses PackedSimd.AddPairwiseWidening
(i16x8.extadd_pairwise_i8x16_u and i32x4.extadd_pairwise_i16x8_u) for
the s1 sum and PackedSimd.MultiplyWideningLower/Upper +
AddPairwiseWidening for the weighted s2 sum.

XxHashShared.MultiplyWideningLower: add a PackedSimd branch that
computes { source[0]*source[1], source[2]*source[3] } via two
shuffles + i64x2.extmul_low_i32x4_u, replacing the portable
mask + 64-bit multiply pair.

Validated: System.IO.Hashing.Tests 4196/4196 passing on both host
arm64 and browser-wasm (the XxHash lane order is checked end-to-end
via the algorithm output bytes — a swap would corrupt every hash).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/System/IO/Hashing/Adler32.cs          | 18 +++++++++++++++++
 .../src/System/IO/Hashing/XxHashShared.cs     | 20 ++++++++++++++++---
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs
index da54cdb3372b19..8b5d04065a544e 100644
--- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs
@@ -8,6 +8,7 @@
 #if NET
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
 using System.Runtime.Intrinsics.X86;
 #endif
 
@@ -300,6 +301,23 @@ private static uint UpdateVector128(uint adler, ReadOnlySpan<byte> source)
                         wprod2 = AdvSimd.MultiplyWideningUpperAndAdd(wprod2, bytes2, tap2.AsByte());
                         vs2 = AdvSimd.AddPairwiseWideningAndAdd(vs2, wprod2);
                     }
+                    else if (PackedSimd.IsSupported)
+                    {
+                        // Widening byte sum: each byte -> ushort pair sum -> uint pair sum, then accumulate into vs1.
+                        // Because weights are all positive (1-32), unsigned byte * unsigned byte multiply is valid for vs2.
+                        Vector128<ushort> sumPairs1 = PackedSimd.AddPairwiseWidening(bytes1);
+                        Vector128<ushort> sumPairs2 = PackedSimd.AddPairwiseWidening(bytes2);
+                        vs1 += PackedSimd.AddPairwiseWidening(sumPairs1) + PackedSimd.AddPairwiseWidening(sumPairs2);
+
+                        // bytes * weights -> 8 ushorts low + 8 ushorts high, sum pairwise to 4 uints + 4 uints.
+                        Vector128<ushort> wprod1Lo = PackedSimd.MultiplyWideningLower(bytes1, tap1.AsByte());
+                        Vector128<ushort> wprod1Hi = PackedSimd.MultiplyWideningUpper(bytes1, tap1.AsByte());
+                        vs2 += PackedSimd.AddPairwiseWidening(wprod1Lo) + PackedSimd.AddPairwiseWidening(wprod1Hi);
+
+                        Vector128<ushort> wprod2Lo = PackedSimd.MultiplyWideningLower(bytes2, tap2.AsByte());
+                        Vector128<ushort> wprod2Hi = PackedSimd.MultiplyWideningUpper(bytes2, tap2.AsByte());
+                        vs2 += PackedSimd.AddPairwiseWidening(wprod2Lo) + PackedSimd.AddPairwiseWidening(wprod2Hi);
+                    }
                     else
                     {
                         (Vector128<ushort> lo1, Vector128<ushort> hi1) = Vector128.Widen(bytes1);
diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHashShared.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHashShared.cs
index 00f1f919a93c70..0f8a6dfa2e7540 100644
--- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHashShared.cs
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHashShared.cs
@@ -9,6 +9,7 @@
 #if NET
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
 using System.Runtime.Intrinsics.X86;
 #endif
 
@@ -702,12 +703,25 @@ private static Vector128<ulong> MultiplyWideningLower(Vector128<uint> source)
                 Vector64<uint> sourceHigh = Vector128.Shuffle(source, Vector128.Create(1u, 3, 0, 0)).GetLower();
                 return AdvSimd.MultiplyWideningLower(sourceLow, sourceHigh);
             }
+            else if (Sse2.IsSupported)
+            {
+                Vector128<uint> sourceLow = Vector128.Shuffle(source, Vector128.Create(1u, 0, 3, 0));
+                return Sse2.Multiply(source, sourceLow);
+            }
+            else if (PackedSimd.IsSupported)
+            {
+                // PackedSimd.MultiplyWideningLower (i64x2.extmul_low_i32x4_u) does
+                // result[i] = (ulong)a[i] * (ulong)b[i] for i in {0, 1}.
+                // We need { source[0]*source[1], source[2]*source[3] } to match the Sse2/AdvSimd paths,
+                // so first move the even lanes into one operand and the odd lanes into the other.
+                Vector128<uint> evens = Vector128.Shuffle(source, Vector128.Create(0u, 2, 0, 0));
+                Vector128<uint> odds = Vector128.Shuffle(source, Vector128.Create(1u, 3, 0, 0));
+                return PackedSimd.MultiplyWideningLower(evens, odds);
+            }
             else
             {
                 Vector128<uint> sourceLow = Vector128.Shuffle(source, Vector128.Create(1u, 0, 3, 0));
-                return Sse2.IsSupported ?
-                    Sse2.Multiply(source, sourceLow) :
-                    (source & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64() * (sourceLow & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64();
+                return (source & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64() * (sourceLow & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64();
             }
         }
 #endif

From 731effa2b3ae0c9a25c003f7b1b47726a8086acf Mon Sep 17 00:00:00 2001
From: lewing <lewing@microsoft.com>
Date: Thu, 25 Jun 2026 06:59:53 -0500
Subject: [PATCH 08/11] Replace PackedSimd.Shuffle with ShuffleNative+OR in
 Vector128.UnpackLow/UnpackHigh
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PackedSimd.Shuffle wraps i8x16.shuffle which requires its 16 lane
indices to be compile-time constants. Mono interpreter accepted a
Vector128.Create() constant operand at runtime, but Mono AOT cannot
fold it and throws PlatformNotSupportedException at runtime.

The same limitation was already known and avoided in
TeddyHelper.RightShift1/RightShift2 (see preceding commit on this
branch) — use two Vector128.ShuffleNative calls (lowering to
PackedSimd.Swizzle, which yields 0 for lanes whose index is out of
range) and OR the partial results together. Apply the same pattern in
Vector128.UnpackLow/UnpackHigh.

This was caught by CI as 50 GuidTests + cascaded reflection-invoke
failures under the WasmTestOnChrome-MONO-ST (AOT) leg on PR #129838.
On Mono interpreter all callers (HexConverter, Guid.FormatGuid) had
already been validated end-to-end.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/System/Runtime/Intrinsics/Vector128.cs  | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
index 987c3c3d30671f..5f0d2d5b60822f 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
@@ -4401,8 +4401,14 @@ internal static Vector128<byte> UnpackLow(Vector128<byte> left, Vector128<byte>
             }
             else if (PackedSimd.IsSupported)
             {
-                return PackedSimd.Shuffle(left, right,
-                    Vector128.Create((byte)0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23));
+                // Compose with two ShuffleNative (PackedSimd.Swizzle, which clamps indices >= 16 to 0)
+                // plus OR. PackedSimd.Shuffle (two-vector i8x16.shuffle) would require constant lane
+                // indices and is impractical to call portably from generic code paths.
+                Vector128<byte> leftPart = Vector128.ShuffleNative(left,
+                    Vector128.Create((byte)0, 0xFF, 1, 0xFF, 2, 0xFF, 3, 0xFF, 4, 0xFF, 5, 0xFF, 6, 0xFF, 7, 0xFF));
+                Vector128<byte> rightPart = Vector128.ShuffleNative(right,
+                    Vector128.Create((byte)0xFF, 0, 0xFF, 1, 0xFF, 2, 0xFF, 3, 0xFF, 4, 0xFF, 5, 0xFF, 6, 0xFF, 7));
+                return leftPart | rightPart;
             }
             ThrowHelper.ThrowNotSupportedException();
             return default;
@@ -4424,8 +4430,11 @@ internal static Vector128<byte> UnpackHigh(Vector128<byte> left, Vector128<byte>
             }
             else if (PackedSimd.IsSupported)
             {
-                return PackedSimd.Shuffle(left, right,
-                    Vector128.Create((byte)8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31));
+                Vector128<byte> leftPart = Vector128.ShuffleNative(left,
+                    Vector128.Create((byte)8, 0xFF, 9, 0xFF, 10, 0xFF, 11, 0xFF, 12, 0xFF, 13, 0xFF, 14, 0xFF, 15, 0xFF));
+                Vector128<byte> rightPart = Vector128.ShuffleNative(right,
+                    Vector128.Create((byte)0xFF, 8, 0xFF, 9, 0xFF, 10, 0xFF, 11, 0xFF, 12, 0xFF, 13, 0xFF, 14, 0xFF, 15));
+                return leftPart | rightPart;
             }
             ThrowHelper.ThrowNotSupportedException();
             return default;

From 0e99f4a194c119b624c506e137aec3da642660cf Mon Sep 17 00:00:00 2001
From: lewing <lewing@microsoft.com>
Date: Thu, 25 Jun 2026 07:36:10 -0500
Subject: [PATCH 09/11] Call PackedSimd.Swizzle directly in Vector128.Unpack
 and TeddyHelper.RightShift

Both helpers previously dispatched to PackedSimd via
Vector128.ShuffleNative, which itself has a Ssse3 -> AdvSimd.Arm64 ->
PackedSimd if/else chain. The Mono SIMD intrinsic recognizer does not
always lower that chain cleanly for less-traveled paths, surfacing as
NIY interpreter assertions and runtime startup failures.

Call PackedSimd.Swizzle (i8x16.swizzle) directly under the
PackedSimd.IsSupported branch. The semantics are identical to
ShuffleNative on Wasm (lanes with an index >= 16 become 0) but the
lowering goes through a single recognized intrinsic, avoiding the
dispatcher chain.

Validated: System.Memory.Tests on browser-wasm V8 interpreter
52249/52249 (covers TeddyHelper.RightShift1/2). The original NIY
OutOfMemoryException:.ctor failure seen in System.Runtime.Tests with
the prior ShuffleNative version is gone with this change. AOT
behaviour will be re-validated by CI on push.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/System/Runtime/Intrinsics/Vector128.cs  | 17 ++++++++++-------
 .../SearchValues/Strings/Helpers/TeddyHelper.cs | 14 ++++++++------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
index 5f0d2d5b60822f..98659953164aec 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
@@ -4401,12 +4401,15 @@ internal static Vector128<byte> UnpackLow(Vector128<byte> left, Vector128<byte>
             }
             else if (PackedSimd.IsSupported)
             {
-                // Compose with two ShuffleNative (PackedSimd.Swizzle, which clamps indices >= 16 to 0)
-                // plus OR. PackedSimd.Shuffle (two-vector i8x16.shuffle) would require constant lane
-                // indices and is impractical to call portably from generic code paths.
-                Vector128<byte> leftPart = Vector128.ShuffleNative(left,
+                // Compose with two PackedSimd.Swizzle calls (out-of-range indices yield 0) plus OR.
+                // We call PackedSimd.Swizzle directly rather than Vector128.ShuffleNative because
+                // the latter goes through a Ssse3 -> AdvSimd.Arm64 -> PackedSimd dispatcher chain
+                // that the Mono SIMD intrinsic recognizer doesn't always lower cleanly.
+                // PackedSimd.Shuffle (two-vector i8x16.shuffle) requires constant lane indices
+                // and is impractical to call portably from generic code paths.
+                Vector128<byte> leftPart = PackedSimd.Swizzle(left,
                     Vector128.Create((byte)0, 0xFF, 1, 0xFF, 2, 0xFF, 3, 0xFF, 4, 0xFF, 5, 0xFF, 6, 0xFF, 7, 0xFF));
-                Vector128<byte> rightPart = Vector128.ShuffleNative(right,
+                Vector128<byte> rightPart = PackedSimd.Swizzle(right,
                     Vector128.Create((byte)0xFF, 0, 0xFF, 1, 0xFF, 2, 0xFF, 3, 0xFF, 4, 0xFF, 5, 0xFF, 6, 0xFF, 7));
                 return leftPart | rightPart;
             }
@@ -4430,9 +4433,9 @@ internal static Vector128<byte> UnpackHigh(Vector128<byte> left, Vector128<byte>
             }
             else if (PackedSimd.IsSupported)
             {
-                Vector128<byte> leftPart = Vector128.ShuffleNative(left,
+                Vector128<byte> leftPart = PackedSimd.Swizzle(left,
                     Vector128.Create((byte)8, 0xFF, 9, 0xFF, 10, 0xFF, 11, 0xFF, 12, 0xFF, 13, 0xFF, 14, 0xFF, 15, 0xFF));
-                Vector128<byte> rightPart = Vector128.ShuffleNative(right,
+                Vector128<byte> rightPart = PackedSimd.Swizzle(right,
                     Vector128.Create((byte)0xFF, 8, 0xFF, 9, 0xFF, 10, 0xFF, 11, 0xFF, 12, 0xFF, 13, 0xFF, 14, 0xFF, 15));
                 return leftPart | rightPart;
             }
diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs
index 7fae7b0fd35c50..8d6eb67922e4fb 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/TeddyHelper.cs
@@ -369,11 +369,13 @@ private static Vector128<byte> RightShift1(Vector128<byte> left, Vector128<byte>
             }
             else if (PackedSimd.IsSupported)
             {
-                // ShuffleNative lowers to PackedSimd.Swizzle which clamps out-of-range
-                // indices (>= 16) to 0, so we can compose the two halves with an OR.
-                Vector128<byte> leftPart = Vector128.ShuffleNative(left,
+                // Call PackedSimd.Swizzle directly (i8x16.swizzle) rather than through
+                // Vector128.ShuffleNative's dispatcher chain, which the Mono SIMD intrinsic
+                // recognizer doesn't always lower cleanly. Swizzle yields 0 for lanes whose
+                // index is out of range (>= 16), so we can compose the two halves with OR.
+                Vector128<byte> leftPart = PackedSimd.Swizzle(left,
                     Vector128.Create((byte)15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));
-                Vector128<byte> rightPart = Vector128.ShuffleNative(right,
+                Vector128<byte> rightPart = PackedSimd.Swizzle(right,
                     Vector128.Create((byte)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14));
                 return leftPart | rightPart;
             }
@@ -407,9 +409,9 @@ private static Vector128<byte> RightShift2(Vector128<byte> left, Vector128<byte>
             }
             else if (PackedSimd.IsSupported)
             {
-                Vector128<byte> leftPart = Vector128.ShuffleNative(left,
+                Vector128<byte> leftPart = PackedSimd.Swizzle(left,
                     Vector128.Create((byte)14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));
-                Vector128<byte> rightPart = Vector128.ShuffleNative(right,
+                Vector128<byte> rightPart = PackedSimd.Swizzle(right,
                     Vector128.Create((byte)0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13));
                 return leftPart | rightPart;
             }

From 254370263306d9e5aed3b97d66355d93a3327fcb Mon Sep 17 00:00:00 2001
From: lewing <lewing@microsoft.com>
Date: Thu, 25 Jun 2026 07:47:43 -0500
Subject: [PATCH 10/11] Update Utf8Utility.Validation comment for Wasm

Reword the LoopTerminatedEarlyDueToNonAsciiData label comment to
mention that Wasm is also a little-endian-only platform reaching
this point through the PackedSimd branch added earlier in this PR.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/System/Text/Unicode/Utf8Utility.Validation.cs         | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index 5388c6c68d0052..dff9af0fdcac9d 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -190,8 +190,8 @@ internal static unsafe partial class Utf8Utility
 
 #if NET
                     LoopTerminatedEarlyDueToNonAsciiData:
-                        // x86 can only be little endian, while ARM can be big or little endian
-                        // so if we reached this label we need to check both combinations are supported
+                        // x86 and Wasm can only be little endian, while ARM can be big or little endian,
+                        // so if we reached this label on ARM it must also be running in little-endian mode.
                         Debug.Assert((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported || PackedSimd.IsSupported);
 
 

From ec32de08b80d33601e50cdefebf57b94328f45ab Mon Sep 17 00:00:00 2001
From: lewing <lewing@microsoft.com>
Date: Thu, 25 Jun 2026 08:01:36 -0500
Subject: [PATCH 11/11] Drop redundant '&& BitConverter.IsLittleEndian' from
 PackedSimd gate

Wasm is always little-endian by spec, so the
BitConverter.IsLittleEndian check on the PackedSimd.IsSupported branch
is a no-op. Keep the check on the AdvSimd.Arm64 branch where it
actually matters (NEON can be big- or little-endian on some
configurations).

Mirrors how the existing Sse41.X64 branch in the same gate has no
LE check (x86-64 is also always little-endian).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/System/Text/Unicode/Utf8Utility.Transcoding.cs        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs
index 7931269a0e5c60..581d231076678f 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs
@@ -885,7 +885,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
 #if NET
             Vector128<short> nonAsciiUtf16DataMask;
 
-            if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || (PackedSimd.IsSupported && BitConverter.IsLittleEndian))
+            if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || PackedSimd.IsSupported)
             {
                 nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char
             }
@@ -945,7 +945,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                     uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);
 
 #if NET
-                    if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || (PackedSimd.IsSupported && BitConverter.IsLittleEndian))
+                    if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || PackedSimd.IsSupported)
                     {
                         // Try reading and writing 8 elements per iteration.
                         uint maxIters = minElementsRemaining / 8;