WordPress · sirreal · Jun 12, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 23, 2026
diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php
@@ -367,34 +367,60 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
 
 		$after_name = $name_at + $name_length;
 
-		// If the match ended with a semicolon then it should always be decoded.
-		if ( ';' === $text[ $name_at + $name_length - 1 ] ) {
-			$match_byte_length = $after_name - $at;
-			return $replacement;
-		}
-
-		/*
-		 * At this point though there's a match for an entry in the named
-		 * character reference table but the match doesn't end in `;`.
-		 * It may be allowed if it's followed by something unambiguous.
+		/*
+		 * For historical reasons, a matched named character reference is left as literal
+		 * text (its decoded replacement is not used) when all of the following hold:
+		 *
+		 * 1. It was matched in attribute context.
+		 * 2. The match does not end in U+003B SEMICOLON (;) — i.e. it is one of the
+		 *    legacy forms recognized without a trailing semicolon.
+		 * 3. The next input character is U+003D EQUALS SIGN (=) or an ASCII alphanumeric.
+		 *
+		 * Some illustrative examples follow. Note that both `not` and `not;` appear in the
+		 * named character references list. References start with `&` and typically end with
+		 * `;`, but the legacy forms are recognized without one.
+		 *
+		 * - In _data context_, "&notme" is decoded to "¬me": condition 1 fails (not an
+		 *   attribute), so the reference is decoded.
+		 * - In _attribute context_, "&not;me" is decoded to "¬me": the longest match is
+		 *   "not;", which ends in a semicolon, so condition 2 fails.
+		 * - In _attribute context_, "&not己" is decoded to "¬己": the following character
+		 *   "己" is a letter but not an ASCII alphanumeric (nor "="), so condition 3 fails.
+		 * - In _attribute context_, "&not" is decoded to "¬": there is no next input
+		 *   character, so condition 3 fails.
+		 * - In _attribute context_, "&not=me" is left as the literal text "&not=me": all
+		 *   three conditions hold.
+		 * - In _attribute context_, "&notme" is left as the literal text "&notme": all
+		 *   three conditions hold.
+		 *
+		 * Without these special rules, ordinary URL query strings could have surprising
+		 * replacements applied. Consider:
+		 *
+		 *     <a href="/?random&degree&gt=0&lt=360&not=90">
+		 *
+		 * These special rules preserve the literal attribute value:
+		 * `/?random&degree&gt=0&lt=360&not=90`. Without them, the value would be decoded
+		 * as `/?random°ree>=0<=360¬=90`, likely not the intended value.
+		 *
+		 * (Authors should not rely on this. Escaping the example as
+		 * `/?random&amp;degree&amp;gt=0&amp;lt=360&amp;not=90` produces the intended
+		 * value regardless of the following character.)
+		 *
+		 * @see https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+		 * @see https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
 		 */
-		$ambiguous_follower = (
-			$after_name < $length &&
-			$name_at < $length &&
-			(
-				ctype_alnum( $text[ $after_name ] ) ||
-				'=' === $text[ $after_name ]
-			)
-		);
-
-		// It's non-ambiguous, safe to leave it in.
-		if ( ! $ambiguous_follower ) {
+		if ( 'attribute' !== $context || ';' === $text[ $after_name - 1 ] || $after_name >= $length ) {
 			$match_byte_length = $after_name - $at;
 			return $replacement;
 		}
 
-		// It's ambiguous, which isn't allowed inside attributes.
-		if ( 'attribute' === $context ) {
+		$follower_byte = ord( $text[ $after_name ] );
+		if (
+			0x3D === $follower_byte || //                              EQUALS SIGN
+			( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) || // ASCII digits 0-9
+			( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) || // ASCII upper alpha A-Z
+			( $follower_byte >= 0x61 && $follower_byte <= 0x7A )    // ASCII lower alpha a-z
+		) {
 			return null;
 		}
 

diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php
@@ -12,6 +12,51 @@
  * @coversDefaultClass WP_HTML_Decoder
  */
 class Tests_HtmlApi_WpHtmlDecoder extends WP_UnitTestCase {
+	/**
+	 * Original LC_CTYPE locale.
+	 *
+	 * @var string|null
+	 */
+	private static ?string $original_lc_ctype = null;
+
+	/**
+	 * Locale where ctype_alnum() classifies high-bit bytes as alphanumeric.
+	 *
+	 * @var string|null
+	 */
+	private static ?string $problematic_lc_ctype = null;
+
+	public static function set_up_before_class() {
+		parent::set_up_before_class();
+
+		self::$original_lc_ctype = setlocale( LC_CTYPE, 0 ); // Remember the active LC_CTYPE so it can be restored.
+
+		// Probe candidates for a locale where ctype_alnum() reports the lone high-bit byte 0xC2 as alphanumeric.
+		$locale_candidates = array(
+			'C.UTF-8',
+			'C.utf8',
+			'en_US.UTF-8',
+			'en_US.utf8',
+			'en_GB.UTF-8',
+			'en_GB.utf8',
+		);
+		foreach ( $locale_candidates as $locale ) {
+			$candidate_locale = setlocale( LC_CTYPE, $locale ); // False when the locale could not be set.
+
+			if ( false !== $candidate_locale && ctype_alnum( "\xC2" ) ) {
+				self::$problematic_lc_ctype = $candidate_locale; // Stays null when no candidate qualifies.
+				break;
+			}
+		}
+
+		setlocale( LC_CTYPE, self::$original_lc_ctype ); // Probing must not leave a different locale active.
+	}
+
+	public function tear_down() {
+		setlocale( LC_CTYPE, self::$original_lc_ctype ); // Undo any locale switch made by a test method.
+		parent::tear_down();
+	}
+
 	/**
 	 * Ensures proper decoding of edge cases.
 	 *
@@ -61,6 +106,115 @@ static function ( int $errno, string $errstr ) use ( &$errors ) {
 		$this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' );
 	}
 
+	/**
+	 * Ensures semicolonless legacy references are decoded in attributes when followed by non-ASCII UTF-8 bytes.
+	 *
+	 * @dataProvider data_semicolonless_attribute_behaviors
+	 *
+	 * @ticket 65372
+	 */
+	public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower( string $encoded_attribute_value, string $expected, string $expected_decode, int $expected_byte_length ): void {
+		if ( null !== self::$problematic_lc_ctype ) { // Null when no probed locale qualified; the current locale is then used.
+			setlocale( LC_CTYPE, self::$problematic_lc_ctype );
+		}
+
+		$this->assertSame(
+			$expected,
+			WP_HTML_Decoder::decode_attribute( $encoded_attribute_value ),
+			'Failed to decode the full attribute value as expected.'
+		);
+
+		$match_byte_length = null; // Passed by reference and checked against the expected length below.
+		$this->assertSame(
+			$expected_decode,
+			WP_HTML_Decoder::read_character_reference( 'attribute', $encoded_attribute_value, 0, $match_byte_length ),
+			'Failed to decode the character reference as expected.'
+		);
+		$this->assertSame( $expected_byte_length, $match_byte_length, 'Failed to produce expected byte length.' );
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * Attribute values containing semicolonless character references whose follower bytes
+	 * may be classified as alphanumeric by `ctype_alnum()` on some systems, but must never
+	 * be recognized as ASCII alphanumerics according to the HTML standard.
+	 *
+	 * @see https://html.spec.whatwg.org/#named-character-reference-state
+	 *
+	 * @return array<array{
+	 *   string, // Encoded attribute value.
+	 *   string, // Expected full decode.
+	 *   string, // Expected character decode.
+	 *   int,    // Replaced character reference byte length.
+	 * }> Test cases.
+	 */
+	public static function data_semicolonless_attribute_behaviors(): array {
+		return array(
+			array( '&copy¯\_(ツ)_/¯', '©¯\_(ツ)_/¯', '©', 5 ), // Follower U+00AF, UTF-8 bytes C2 AF.
+			array( '&notಠ_ಠ', '¬ಠ_ಠ', '¬', 4 ), // Follower U+0CA0, UTF-8 bytes E0 B2 A0.
+			array( '&nbsp£20', "\u{00A0}£20", "\u{00A0}", 5 ), // Follower U+00A3, UTF-8 bytes C2 A3.
+			array( '&nbsp🎉', "\u{00A0}🎉", "\u{00A0}", 5 ), // Follower U+1F389, UTF-8 bytes F0 9F 8E 89.
+			array( '&reg™', '®™', '®', 4 ), // Follower U+2122, UTF-8 bytes E2 84 A2.
+		);
+	}
+
+	/**
+	 * Ensures semicolonless legacy references are left undecoded in attributes when followed by "=" or an ASCII alphanumeric.
+	 *
+	 * @dataProvider data_semicolonless_attribute_character_reference_no_decode_followers
+	 *
+	 * @ticket 65372
+	 *
+	 * @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower.
+	 */
+	public function test_ascii_alphanumeric_attribute_follower_is_ambiguous( string $raw_attribute ): void {
+		$this->assertSame(
+			$raw_attribute,
+			WP_HTML_Decoder::decode_attribute( $raw_attribute ),
+			'Should not have decoded an ambiguous semicolonless legacy reference.'
+		);
+
+		$match_byte_length = 'sentinel'; // Expected to be left untouched when no reference is matched.
+		$this->assertNull(
+			WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ),
+			'Should not have matched an ambiguous semicolonless legacy reference.'
+		);
+		$this->assertSame( 'sentinel', $match_byte_length );
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * HTML character references with followers that trigger the literal flush behavior
+	 * when parsing attribute values. HTML defines this as `=` or an ASCII alphanumeric character.
+	 *
+	 * > An ASCII alphanumeric is an ASCII digit or ASCII alpha.
+	 * > An ASCII alpha is an ASCII upper alpha or ASCII lower alpha.
+	 *
+	 * @see https://html.spec.whatwg.org/#named-character-reference-state
+	 *
+	 * @return Generator<string, array{ string }> Test cases.
+	 */
+	public static function data_semicolonless_attribute_character_reference_no_decode_followers(): Generator {
+		yield "Equals sign follower '='" => array( '&Aacute=' ); // U+003D is the only non-alphanumeric follower covered.
+		// > An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9), inclusive.
+		for ( $i = 0x30; $i <= 0x39; $i++ ) {
+			$char = chr( $i );
+			yield "ASCII digit follower '{$char}'" => array( "&Aacute{$char}" );
+		}
+		// > An ASCII upper alpha is a code point in the range U+0041 (A) to U+005A (Z), inclusive.
+		for ( $i = 0x41; $i <= 0x5A; $i++ ) {
+			$char = chr( $i );
+			yield "ASCII upper alpha follower '{$char}'" => array( "&Aacute{$char}" );
+		}
+		// > An ASCII lower alpha is a code point in the range U+0061 (a) to U+007A (z), inclusive.
+		for ( $i = 0x61; $i <= 0x7A; $i++ ) {
+			$char = chr( $i );
+			yield "ASCII lower alpha follower '{$char}'" => array( "&Aacute{$char}" );
+		}
+	}
+
 	/**
 	 * Ensures proper detection of attribute prefixes ignoring ASCII case.
 	 *