From f92da80aa6be860b1a98981eb24e3426c23b3e3c Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 12 Jun 2026 22:47:44 +0200 Subject: [PATCH 01/16] Fix attribute legacy reference follower checks --- .../html-api/class-wp-html-decoder.php | 10 ++- .../phpunit/tests/html-api/wpHtmlDecoder.php | 74 +++++++++++++++++++ 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index d14009d3d9fb8..e3da947bf952f 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -378,12 +378,14 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat * character reference table but the match doesn't end in `;`. * It may be allowed if it's followed by something unambiguous. */ + $follower_byte = $after_name < $length ? ord( $text[ $after_name ] ) : null; $ambiguous_follower = ( - $after_name < $length && - $name_at < $length && + null !== $follower_byte && ( - ctype_alnum( $text[ $after_name ] ) || - '=' === $text[ $after_name ] + ( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) || + ( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) || + ( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) || + 0x3D === $follower_byte ) ); diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 97954f4eb3e30..2d46ee39753be 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -61,6 +61,80 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { $this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' ); } + /** + * Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes. + */ + public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower() { + $previous_locale = setlocale( LC_CTYPE, 0 ); + $affected_locale = setlocale( LC_CTYPE, 'C.UTF-8', 'en_US.UTF-8', 'de_DE.UTF-8', 'fr_FR.UTF-8' ); + + if ( false === $affected_locale || ! ctype_alnum( "\xC2" ) ) { + if ( false !== $previous_locale ) { + setlocale( LC_CTYPE, $previous_locale ); + } + + $this->markTestSkipped( 'Requires an LC_CTYPE locale where ctype_alnum() classifies high-bit bytes as alphanumeric.' ); + } + + $raw_attribute = "Á\xC2\x80"; + + try { + $this->assertSame( + "\xC3\x81\xC2\x80", + WP_HTML_Decoder::decode_attribute( $raw_attribute ), + 'Should have decoded the semicolonless legacy reference before a multibyte follower.' + ); + + $match_byte_length = null; + $this->assertSame( + "\xC3\x81", + WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ), + 'Should have matched the semicolonless legacy reference before a multibyte follower.' + ); + $this->assertSame( strlen( 'Á' ), $match_byte_length ); + } finally { + if ( false !== $previous_locale ) { + setlocale( LC_CTYPE, $previous_locale ); + } + } + } + + /** + * Ensures semicolonless legacy references remain ambiguous before ASCII alnum or equals. + * + * @dataProvider data_ambiguous_ascii_attribute_followers + * + * @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower. + */ + public function test_semicolonless_legacy_reference_before_ascii_attribute_follower_is_ambiguous( $raw_attribute ) { + $this->assertSame( + $raw_attribute, + WP_HTML_Decoder::decode_attribute( $raw_attribute ), + 'Should not have decoded an ambiguous semicolonless legacy reference.' + ); + + $match_byte_length = 'sentinel'; + $this->assertNull( + WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ), + 'Should not have matched an ambiguous semicolonless legacy reference.' + ); + $this->assertSame( 'sentinel', $match_byte_length ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public static function data_ambiguous_ascii_attribute_followers() { + return array( + 'ASCII digit' => array( 'Á0' ), + 'ASCII uppercase alpha' => array( 'ÁA' ), + 'ASCII lowercase alpha' => array( 'Áa' ), + 'equals' => array( 'Á=' ), + ); + } + /** * Ensures proper detection of attribute prefixes ignoring ASCII case. * From 8bb66dcf98751c099906c41a42a07c0a319b30de Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Mon, 15 Jun 2026 11:15:25 +0200 Subject: [PATCH 02/16] Fix coding standards for decoder legacy follower checks --- src/wp-includes/html-api/class-wp-html-decoder.php | 2 +- tests/phpunit/tests/html-api/wpHtmlDecoder.php | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index e3da947bf952f..4c3daadb2ac4e 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -378,7 +378,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat * character reference table but the match doesn't end in `;`. * It may be allowed if it's followed by something unambiguous. */ - $follower_byte = $after_name < $length ? ord( $text[ $after_name ] ) : null; + $follower_byte = $after_name < $length ? ord( $text[ $after_name ] ) : null; $ambiguous_follower = ( null !== $follower_byte && ( diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 2d46ee39753be..1bf4b700b4983 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -128,10 +128,10 @@ public function test_semicolonless_legacy_reference_before_ascii_attribute_follo */ public static function data_ambiguous_ascii_attribute_followers() { return array( - 'ASCII digit' => array( 'Á0' ), + 'ASCII digit' => array( 'Á0' ), 'ASCII uppercase alpha' => array( 'ÁA' ), 'ASCII lowercase alpha' => array( 'Áa' ), - 'equals' => array( 'Á=' ), + 'equals' => array( 'Á=' ), ); } From 2187e3313fb00f7b0a70af9843a2c90571a06172 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 12:55:19 +0200 Subject: [PATCH 03/16] Add ticket number --- tests/phpunit/tests/html-api/wpHtmlDecoder.php | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 1bf4b700b4983..34a16a59fcacb 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -63,6 +63,8 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { /** * Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes. + * + * @ticket 65372 */ public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower() { $previous_locale = setlocale( LC_CTYPE, 0 ); @@ -104,6 +106,8 @@ public function test_semicolonless_legacy_reference_before_multibyte_attribute_f * * @dataProvider data_ambiguous_ascii_attribute_followers * + * @ticket 65372 + * * @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower. */ public function test_semicolonless_legacy_reference_before_ascii_attribute_follower_is_ambiguous( $raw_attribute ) { From e6e98c7503afde16456df28e345e3f4b2fa11ec2 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 13:01:56 +0200 Subject: [PATCH 04/16] Tests: Broaden UTF-8 locale candidates for HTML decoder --- .../phpunit/tests/html-api/wpHtmlDecoder.php | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 34a16a59fcacb..9111b447ed4b7 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -68,9 +68,36 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { */ public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower() { $previous_locale = setlocale( LC_CTYPE, 0 ); - $affected_locale = setlocale( LC_CTYPE, 'C.UTF-8', 'en_US.UTF-8', 'de_DE.UTF-8', 'fr_FR.UTF-8' ); - if ( false === $affected_locale || ! ctype_alnum( "\xC2" ) ) { + $locale_candidates = array( + 'C.UTF-8', + 'C.utf8', + 'en_US.UTF-8', + 'en_US.utf8', + 'en_GB.UTF-8', + 'en_GB.utf8', + 'en_AU.UTF-8', + 'en_AU.utf8', + 'en_CA.UTF-8', + 'en_CA.utf8', + 'en_NZ.UTF-8', + 'en_NZ.utf8', + 'en_IE.UTF-8', + 'en_IE.utf8', + ); + + $affected_locale = false; + + foreach ( $locale_candidates as $locale ) { + $candidate_locale = setlocale( LC_CTYPE, $locale ); + + if ( false !== $candidate_locale && ctype_alnum( "\xC2" ) ) { + $affected_locale = $candidate_locale; + break; + } + } + + if ( false === $affected_locale ) { if ( false !== $previous_locale ) { setlocale( LC_CTYPE, $previous_locale ); } From f9c7d741cc505ae06ea1589ee965a49eda2eb5a4 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 13:42:44 +0200 Subject: [PATCH 05/16] Improve tests --- .../phpunit/tests/html-api/wpHtmlDecoder.php | 109 +++++++++++------- 1 file changed, 69 insertions(+), 40 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 9111b447ed4b7..b5807985fa335 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -64,11 +64,14 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { /** * Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes. * + * @dataProvider data_semicolonless_attribute_behaviors + * * @ticket 65372 */ - public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower() { + public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower( string $encoded_attribute_value, string $expected, int $expected_byte_length ): void { $previous_locale = setlocale( LC_CTYPE, 0 ); + // Find a locale where ctype_alnum() classifies high-bit bytes as alphanumeric. $locale_candidates = array( 'C.UTF-8', 'C.utf8', @@ -76,18 +79,9 @@ public function test_semicolonless_legacy_reference_before_multibyte_attribute_f 'en_US.utf8', 'en_GB.UTF-8', 'en_GB.utf8', - 'en_AU.UTF-8', - 'en_AU.utf8', - 'en_CA.UTF-8', - 'en_CA.utf8', - 'en_NZ.UTF-8', - 'en_NZ.utf8', - 'en_IE.UTF-8', - 'en_IE.utf8', ); $affected_locale = false; - foreach ( $locale_candidates as $locale ) { $candidate_locale = setlocale( LC_CTYPE, $locale ); @@ -105,39 +99,56 @@ public function test_semicolonless_legacy_reference_before_multibyte_attribute_f $this->markTestSkipped( 'Requires an LC_CTYPE locale where ctype_alnum() classifies high-bit bytes as alphanumeric.' ); } - $raw_attribute = "Á\xC2\x80"; + $this->assertSame( + $expected, + WP_HTML_Decoder::decode_attribute( $encoded_attribute_value ), + 'Should have decoded the semicolonless legacy reference before a multibyte follower.' + ); - try { - $this->assertSame( - "\xC3\x81\xC2\x80", - WP_HTML_Decoder::decode_attribute( $raw_attribute ), - 'Should have decoded the semicolonless legacy reference before a multibyte follower.' - ); + $match_byte_length = null; + $this->assertSame( + $encoded_attribute_value, + WP_HTML_Decoder::read_character_reference( 'attribute', $encoded_attribute_value, 0, $match_byte_length ), + 'Should have matched the semicolonless legacy reference before a multibyte follower.' + ); + $this->assertSame( $expected_byte_length, $match_byte_length ); + } - $match_byte_length = null; - $this->assertSame( - "\xC3\x81", - WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ), - 'Should have matched the semicolonless legacy reference before a multibyte follower.' - ); - $this->assertSame( strlen( 'Á' ), $match_byte_length ); - } finally { - if ( false !== $previous_locale ) { - setlocale( LC_CTYPE, $previous_locale ); - } - } + /** + * Data provider. + * + * Attribute values encoded with character references including followers that are + * treated as alphanumerics by `ctype_alnum()` on some systems, but should never + * be recognized as ASCII Alphanumerics according the the HTML standards. + * + * @see https://html.spec.whatwg.org/#named-character-reference-state + * + * @return Array Test cases. + */ + public static function data_semicolonless_attribute_behaviors(): array { + return array( + array( '©¯\_(ツ)_/¯', '©¯\_(ツ)_/¯', 5 ), + array( '¬ಠ_ಠ', '¬ಠ_ಠ', 4 ), + array( ' £20', "\xA0£20", 5 ), + array( ' 🎉', "\xA0🎉", 5 ), + array( '®™', '®™', 4 ), + ); } /** - * Ensures semicolonless legacy references remain ambiguous before ASCII alnum or equals. + * Ensures ambiguous ampersand is recognized with trailing ASCII alphanumerics. * - * @dataProvider data_ambiguous_ascii_attribute_followers + * @dataProvider data_semicolonless_attribute_character_reference_no_decode_followers * * @ticket 65372 * * @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower. */ - public function test_semicolonless_legacy_reference_before_ascii_attribute_follower_is_ambiguous( $raw_attribute ) { + public function test_ascii_alphanumeric_attribute_follower_is_ambiguous( string $raw_attribute ): void { $this->assertSame( $raw_attribute, WP_HTML_Decoder::decode_attribute( $raw_attribute ), @@ -155,15 +166,33 @@ public function test_semicolonless_legacy_reference_before_ascii_attribute_follo /** * Data provider. * - * @return array[]. + * HTML character references with followers that trigger the literal flush behavior + * when parsing attribute values. HTML defines this as `"` or an ASCII alphanumeric character. + * + * > An ASCII alphanumeric is an ASCII digit or ASCII alpha. + * > An ASCII alpha is an ASCII upper alpha or ASCII lower alpha. + * + * @see https://html.spec.whatwg.org/#named-character-reference-state + * + * @return Generator Test cases. */ - public static function data_ambiguous_ascii_attribute_followers() { - return array( - 'ASCII digit' => array( 'Á0' ), - 'ASCII uppercase alpha' => array( 'ÁA' ), - 'ASCII lowercase alpha' => array( 'Áa' ), - 'equals' => array( 'Á=' ), - ); + public static function data_semicolonless_attribute_character_reference_no_decode_followers(): Generator { + yield "Trialing '='" => array( 'Á=' ); + // > An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9), inclusive. + for ( $i = 0x30; $i <= 0x39; $i++ ) { + $char = chr( $i ); + yield "ASCII digit follwer '{$char}'" => array( "Á{$char}" ); + } + // > An ASCII upper alpha is a code point in the range U+0041 (A) to U+005A (Z), inclusive. + for ( $i = 0x41; $i <= 0x5A; $i++ ) { + $char = chr( $i ); + yield "ASCII upper alpha follwer '{$char}'" => array( "Á{$char}" ); + } + // > An ASCII lower alpha is a code point in the range U+0061 (a) to U+007A (z), inclusive. + for ( $i = 0x61; $i <= 0x7A; $i++ ) { + $char = chr( $i ); + yield "ASCII lower alpha follwer '{$char}'" => array( "Á{$char}" ); + } } /** From ac0d8428ae1e6ea20bd7b9f20fbf85e0451c70fc Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 13:51:15 +0200 Subject: [PATCH 06/16] Improve test logic --- .../phpunit/tests/html-api/wpHtmlDecoder.php | 90 +++++++++++++------ 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index b5807985fa335..ba940b4339ed1 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -12,6 +12,68 @@ * @coversDefaultClass WP_HTML_Decoder */ class Tests_HtmlApi_WpHtmlDecoder extends WP_UnitTestCase { + /** + * Previous LC_CTYPE locale. + * + * @var string|false + */ + private static $previous_lc_ctype_locale = false; + + /** + * Locale where ctype_alnum() classifies high-bit bytes as alphanumeric. + * + * @var string|null + */ + private static $problematic_lc_ctype_locale = null; + + /** + * Runs the routine before setting up all tests. + */ + public static function set_up_before_class() { + parent::set_up_before_class(); + + self::$previous_lc_ctype_locale = setlocale( LC_CTYPE, 0 ); + + // Find a locale where ctype_alnum() classifies high-bit bytes as alphanumeric. + $locale_candidates = array( + 'C.UTF-8', + 'C.utf8', + 'en_US.UTF-8', + 'en_US.utf8', + 'en_GB.UTF-8', + 'en_GB.utf8', + ); + + foreach ( $locale_candidates as $locale ) { + $candidate_locale = setlocale( LC_CTYPE, $locale ); + + if ( false !== $candidate_locale && ctype_alnum( "\xC2" ) ) { + self::$problematic_lc_ctype_locale = $candidate_locale; + break; + } + } + + if ( null === self::$problematic_lc_ctype_locale ) { + if ( false !== self::$previous_lc_ctype_locale ) { + setlocale( LC_CTYPE, self::$previous_lc_ctype_locale ); + } + } + } + + /** + * Runs the routine after all tests have been run. + */ + public static function tear_down_after_class() { + if ( false !== self::$previous_lc_ctype_locale ) { + setlocale( LC_CTYPE, self::$previous_lc_ctype_locale ); + } + + self::$previous_lc_ctype_locale = false; + self::$problematic_lc_ctype_locale = null; + + parent::tear_down_after_class(); + } + /** * Ensures proper decoding of edge cases. * @@ -69,33 +131,7 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { * @ticket 65372 */ public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower( string $encoded_attribute_value, string $expected, int $expected_byte_length ): void { - $previous_locale = setlocale( LC_CTYPE, 0 ); - - // Find a locale where ctype_alnum() classifies high-bit bytes as alphanumeric. - $locale_candidates = array( - 'C.UTF-8', - 'C.utf8', - 'en_US.UTF-8', - 'en_US.utf8', - 'en_GB.UTF-8', - 'en_GB.utf8', - ); - - $affected_locale = false; - foreach ( $locale_candidates as $locale ) { - $candidate_locale = setlocale( LC_CTYPE, $locale ); - - if ( false !== $candidate_locale && ctype_alnum( "\xC2" ) ) { - $affected_locale = $candidate_locale; - break; - } - } - - if ( false === $affected_locale ) { - if ( false !== $previous_locale ) { - setlocale( LC_CTYPE, $previous_locale ); - } - + if ( null === self::$problematic_lc_ctype_locale ) { $this->markTestSkipped( 'Requires an LC_CTYPE locale where ctype_alnum() classifies high-bit bytes as alphanumeric.' ); } From bd5566e16f439da3829392d17aa34243f1927948 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 14:05:31 +0200 Subject: [PATCH 07/16] Test fixups --- .../phpunit/tests/html-api/wpHtmlDecoder.php | 66 +++++++------------ 1 file changed, 25 insertions(+), 41 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index ba940b4339ed1..9acd17b47a951 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -13,26 +13,23 @@ */ class Tests_HtmlApi_WpHtmlDecoder extends WP_UnitTestCase { /** - * Previous LC_CTYPE locale. + * Original LC_CTYPE locale. * - * @var string|false + * @var string|null */ - private static $previous_lc_ctype_locale = false; + private static ?string $original_lc_ctype = null; /** * Locale where ctype_alnum() classifies high-bit bytes as alphanumeric. * * @var string|null */ - private static $problematic_lc_ctype_locale = null; + private static ?string $problematic_lc_ctype = null; - /** - * Runs the routine before setting up all tests. - */ public static function set_up_before_class() { parent::set_up_before_class(); - self::$previous_lc_ctype_locale = setlocale( LC_CTYPE, 0 ); + self::$original_lc_ctype = setlocale( LC_CTYPE, 0 ); // Find a locale where ctype_alnum() classifies high-bit bytes as alphanumeric. $locale_candidates = array( @@ -43,35 +40,21 @@ public static function set_up_before_class() { 'en_GB.UTF-8', 'en_GB.utf8', ); - foreach ( $locale_candidates as $locale ) { $candidate_locale = setlocale( LC_CTYPE, $locale ); if ( false !== $candidate_locale && ctype_alnum( "\xC2" ) ) { - self::$problematic_lc_ctype_locale = $candidate_locale; + self::$problematic_lc_ctype = $candidate_locale; break; } } - if ( null === self::$problematic_lc_ctype_locale ) { - if ( false !== self::$previous_lc_ctype_locale ) { - setlocale( LC_CTYPE, self::$previous_lc_ctype_locale ); - } - } + setlocale( LC_CTYPE, self::$original_lc_ctype ); } - /** - * Runs the routine after all tests have been run. - */ - public static function tear_down_after_class() { - if ( false !== self::$previous_lc_ctype_locale ) { - setlocale( LC_CTYPE, self::$previous_lc_ctype_locale ); - } - - self::$previous_lc_ctype_locale = false; - self::$problematic_lc_ctype_locale = null; - - parent::tear_down_after_class(); + public function tear_down() { + setlocale( LC_CTYPE, self::$original_lc_ctype ); + parent::tear_down(); } /** @@ -130,24 +113,24 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { * * @ticket 65372 */ - public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower( string $encoded_attribute_value, string $expected, int $expected_byte_length ): void { - if ( null === self::$problematic_lc_ctype_locale ) { - $this->markTestSkipped( 'Requires an LC_CTYPE locale where ctype_alnum() classifies high-bit bytes as alphanumeric.' ); + public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower( string $encoded_attribute_value, string $expected, string $expected_decode, int $expected_byte_length ): void { + if ( null !== self::$problematic_lc_ctype ) { + setlocale( LC_CTYPE, self::$problematic_lc_ctype ); } $this->assertSame( $expected, WP_HTML_Decoder::decode_attribute( $encoded_attribute_value ), - 'Should have decoded the semicolonless legacy reference before a multibyte follower.' + 'Failed to decode the full attribute value as expected.' ); $match_byte_length = null; $this->assertSame( - $encoded_attribute_value, + $expected_decode, WP_HTML_Decoder::read_character_reference( 'attribute', $encoded_attribute_value, 0, $match_byte_length ), - 'Should have matched the semicolonless legacy reference before a multibyte follower.' + 'Failed to decode the character reference as expected.' ); - $this->assertSame( $expected_byte_length, $match_byte_length ); + $this->assertSame( $expected_byte_length, $match_byte_length, 'Failed to produce expected byte length.' ); } /** @@ -161,17 +144,18 @@ public function test_semicolonless_legacy_reference_before_multibyte_attribute_f * * @return Array Test cases. */ public static function data_semicolonless_attribute_behaviors(): array { return array( - array( '©¯\_(ツ)_/¯', '©¯\_(ツ)_/¯', 5 ), - array( '¬ಠ_ಠ', '¬ಠ_ಠ', 4 ), - array( ' £20', "\xA0£20", 5 ), - array( ' 🎉', "\xA0🎉", 5 ), - array( '®™', '®™', 4 ), + array( '©¯\_(ツ)_/¯', '©¯\_(ツ)_/¯', '©', 5 ), + array( '¬ಠ_ಠ', '¬ಠ_ಠ', '¬', 4 ), + array( ' £20', "\u{00A0}£20", "\u{00A0}", 5 ), + array( ' 🎉', "\u{00A0}🎉", "\u{00A0}", 5 ), + array( '®™', '®™', '®', 4 ), ); } From 65e393721b085a7d53a869ce1f68a425f10d35f2 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 15:38:18 +0200 Subject: [PATCH 08/16] Fix test language --- tests/phpunit/tests/html-api/wpHtmlDecoder.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 9acd17b47a951..dcdd0fd78d9ab 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -197,21 +197,21 @@ public function test_ascii_alphanumeric_attribute_follower_is_ambiguous( string * @return Generator Test cases. */ public static function data_semicolonless_attribute_character_reference_no_decode_followers(): Generator { - yield "Trialing '='" => array( 'Á=' ); + yield "Equals sign follower '='" => array( 'Á=' ); // > An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9), inclusive. for ( $i = 0x30; $i <= 0x39; $i++ ) { $char = chr( $i ); - yield "ASCII digit follwer '{$char}'" => array( "Á{$char}" ); + yield "ASCII digit follower '{$char}'" => array( "Á{$char}" ); } // > An ASCII upper alpha is a code point in the range U+0041 (A) to U+005A (Z), inclusive. for ( $i = 0x41; $i <= 0x5A; $i++ ) { $char = chr( $i ); - yield "ASCII upper alpha follwer '{$char}'" => array( "Á{$char}" ); + yield "ASCII upper alpha follower {$char}'" => array( "Á{$char}" ); } // > An ASCII lower alpha is a code point in the range U+0061 (a) to U+007A (z), inclusive. for ( $i = 0x61; $i <= 0x7A; $i++ ) { $char = chr( $i ); - yield "ASCII lower alpha follwer '{$char}'" => array( "Á{$char}" ); + yield "ASCII lower alpha follower '{$char}'" => array( "Á{$char}" ); } } From cb3b277e6af3b27581b41b48f983c74a27cf64cc Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 16:02:37 +0200 Subject: [PATCH 09/16] Fix test description --- tests/phpunit/tests/html-api/wpHtmlDecoder.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index dcdd0fd78d9ab..c7628b21f88ff 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -187,7 +187,7 @@ public function test_ascii_alphanumeric_attribute_follower_is_ambiguous( string * Data provider. * * HTML character references with followers that trigger the literal flush behavior - * when parsing attribute values. HTML defines this as `"` or an ASCII alphanumeric character. + * when parsing attribute values. HTML defines this as `=` or an ASCII alphanumeric character. * * > An ASCII alphanumeric is an ASCII digit or ASCII alpha. * > An ASCII alpha is an ASCII upper alpha or ASCII lower alpha. From deccaae9e466a5bd13661adf5c666297c88367ea Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 16:12:14 +0200 Subject: [PATCH 10/16] Rework decoding bail to match spec, improve perf and clarity --- .../html-api/class-wp-html-decoder.php | 39 +++++++------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 856363a56fb59..c25648e37bc81 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -367,36 +367,27 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat $after_name = $name_at + $name_length; - // If the match ended with a semicolon then it should always be decoded. - if ( ';' === $text[ $name_at + $name_length - 1 ] ) { - $match_byte_length = $after_name - $at; - return $replacement; - } - /* - * At this point though there's a match for an entry in the named - * character reference table but the match doesn't end in `;`. - * It may be allowed if it's followed by something unambiguous. + * Named character references are NOT decoded if + * - If the character reference was consumed as part of an attribute + * - AND the last character matched is not a U+003B SEMICOLON character (;) + * - AND the next input character is either a U+003D EQUALS SIGN character (=) + * or an ASCII alphanumeric + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state */ - $follower_byte = $after_name < $length ? ord( $text[ $after_name ] ) : null; - $ambiguous_follower = ( - null !== $follower_byte && - ( - ( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) || - ( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) || - ( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) || - 0x3D === $follower_byte - ) - ); - - // It's non-ambiguous, safe to leave it in. - if ( ! $ambiguous_follower ) { + if ( 'attribute' !== $context || ';' === $text[ $after_name - 1 ] || $after_name >= $length ) { $match_byte_length = $after_name - $at; return $replacement; } - // It's ambiguous, which isn't allowed inside attributes. - if ( 'attribute' === $context ) { + $follower_byte = ord( $text[ $after_name ] ); + if ( + 0x3D === $follower_byte || // U+003D EQUALS SIGN + ( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) || // ASCII digits 0-9 + ( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) || // ASCII upper alpha A-Z + ( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) // ASCII lower alpha a-z + ) { return null; } From 63a2fc16a2ef744c4b4f58fc1a491f13622920f9 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 16:14:04 +0200 Subject: [PATCH 11/16] clean up language --- src/wp-includes/html-api/class-wp-html-decoder.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index c25648e37bc81..2c4413d84da4b 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -369,7 +369,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat /* * Named character references are NOT decoded if - * - If the character reference was consumed as part of an attribute + * - the character reference was consumed as part of an attribute * - AND the last character matched is not a U+003B SEMICOLON character (;) * - AND the next input character is either a U+003D EQUALS SIGN character (=) * or an ASCII alphanumeric From e0d7a4584fe1f19f1ba776790c5b16bc879a6353 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 16:15:15 +0200 Subject: [PATCH 12/16] Improve comment --- src/wp-includes/html-api/class-wp-html-decoder.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 2c4413d84da4b..42bcf260f588a 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -383,7 +383,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat $follower_byte = ord( $text[ $after_name ] ); if ( - 0x3D === $follower_byte || // U+003D EQUALS SIGN + 0x3D === $follower_byte || // EQUALS SIGN ( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) || // ASCII digits 0-9 ( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) || // ASCII upper alpha A-Z ( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) // ASCII lower alpha a-z From a2a93577d198fcf983667a5713f01e97d68932b6 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 16:18:16 +0200 Subject: [PATCH 13/16] Tighten up spec --- src/wp-includes/html-api/class-wp-html-decoder.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 42bcf260f588a..62ce0c67ba48a 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -368,10 +368,10 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat $after_name = $name_at + $name_length; /* - * Named character references are NOT decoded if + * A named character reference match is not decoded when all following conditions are true: * - the character reference was consumed as part of an attribute - * - AND the last character matched is not a U+003B SEMICOLON character (;) - * - AND the next input character is either a U+003D EQUALS SIGN character (=) + * - the last character matched is not a U+003B SEMICOLON character (;) + * - the next input character is either a U+003D EQUALS SIGN character (=) * or an ASCII alphanumeric * * @see https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state From 998be4198192619ab42bf02f75d48a45f4f137fe Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 16:21:44 +0200 Subject: [PATCH 14/16] Fix data provider test name Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- tests/phpunit/tests/html-api/wpHtmlDecoder.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index c7628b21f88ff..9813652939aba 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -206,7 +206,7 @@ public static function data_semicolonless_attribute_character_reference_no_decod // > An ASCII upper alpha is a code point in the range U+0041 (A) to U+005A (Z), inclusive. for ( $i = 0x41; $i <= 0x5A; $i++ ) { $char = chr( $i ); - yield "ASCII upper alpha follower {$char}'" => array( "Á{$char}" ); + yield "ASCII upper alpha follower '{$char}'" => array( "Á{$char}" ); } // > An ASCII lower alpha is a code point in the range U+0061 (a) to U+007A (z), inclusive. for ( $i = 0x61; $i <= 0x7A; $i++ ) { From 26a8d10944b4256c7f9997a211ced193d60cc06d Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 23 Jun 2026 16:24:13 +0200 Subject: [PATCH 15/16] Fix tests phpdoc Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- tests/phpunit/tests/html-api/wpHtmlDecoder.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 9813652939aba..9c58addcdad9c 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -138,11 +138,11 @@ public function test_semicolonless_legacy_reference_before_multibyte_attribute_f * * Attribute values encoded with character references including followers that are * treated as alphanumerics by `ctype_alnum()` on some systems, but should never - * be recognized as ASCII Alphanumerics according the the HTML standards. + * be recognized as ASCII Alphanumerics according to the HTML standards. * * @see https://html.spec.whatwg.org/#named-character-reference-state * - * @return Array Date: Tue, 23 Jun 2026 16:50:49 +0200 Subject: [PATCH 16/16] Revert ctype_alnum() fix --- .../html-api/class-wp-html-decoder.php | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 62ce0c67ba48a..b6c240bdcff5f 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -367,27 +367,34 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat $after_name = $name_at + $name_length; + // If the match ended with a semicolon then it should always be decoded. + if ( ';' === $text[ $name_at + $name_length - 1 ] ) { + $match_byte_length = $after_name - $at; + return $replacement; + } + /* - * A named character reference match is not decoded when all following conditions are true: - * - the character reference was consumed as part of an attribute - * - the last character matched is not a U+003B SEMICOLON character (;) - * - the next input character is either a U+003D EQUALS SIGN character (=) - * or an ASCII alphanumeric - * - * @see https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state + * At this point though there's a match for an entry in the named + * character reference table but the match doesn't end in `;`. + * It may be allowed if it's followed by something unambiguous. */ - if ( 'attribute' !== $context || ';' === $text[ $after_name - 1 ] || $after_name >= $length ) { + $ambiguous_follower = ( + $after_name < $length && + $name_at < $length && + ( + ctype_alnum( $text[ $after_name ] ) || + '=' === $text[ $after_name ] + ) + ); + + // It's non-ambiguous, safe to leave it in. + if ( ! $ambiguous_follower ) { $match_byte_length = $after_name - $at; return $replacement; } - $follower_byte = ord( $text[ $after_name ] ); - if ( - 0x3D === $follower_byte || // EQUALS SIGN - ( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) || // ASCII digits 0-9 - ( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) || // ASCII upper alpha A-Z - ( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) // ASCII lower alpha a-z - ) { + // It's ambiguous, which isn't allowed inside attributes. + if ( 'attribute' === $context ) { return null; }