diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php
index b6c240bdcff5f..e4634f8fa23ed 100644
--- a/src/wp-includes/html-api/class-wp-html-decoder.php
+++ b/src/wp-includes/html-api/class-wp-html-decoder.php
@@ -367,34 +367,60 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
 
 		$after_name = $name_at + $name_length;
 
-		// If the match ended with a semicolon then it should always be decoded.
-		if ( ';' === $text[ $name_at + $name_length - 1 ] ) {
-			$match_byte_length = $after_name - $at;
-			return $replacement;
-		}
-
-		/*
-		 * At this point though there's a match for an entry in the named
-		 * character reference table but the match doesn't end in `;`.
-		 * It may be allowed if it's followed by something unambiguous.
+		/**
+		 * For historical reasons, a matched named character reference is left as literal
+		 * text (its decoded replacement is not used) when all of the following hold:
+		 *
+		 * 1. It was matched in attribute context.
+		 * 2. The match does not end in U+003B SEMICOLON (;) — i.e. it is one of the
+		 *    legacy forms recognized without a trailing semicolon.
+		 * 3. The next input character is U+003D EQUALS SIGN (=) or an ASCII alphanumeric.
+		 *
+		 * Some illustrative examples follow. Note that both `not` and `not;` appear in the
+		 * named character references list. References start with `&` and typically end with
+		 * `;`, but the legacy forms are recognized without one.
+		 *
+		 * - In _data context_, "&notme" is decoded to "¬me": condition 1 fails (not an
+		 *   attribute), so the reference is decoded.
+		 * - In _attribute context_, "&not;me" is decoded to "¬me": the longest match is
+		 *   "not;", which ends in a semicolon, so condition 2 fails.
+		 * - In _attribute context_, "&not己" is decoded to "¬己": the following character
+		 *   "己" is a letter but not an ASCII alphanumeric (nor "="), so condition 3 fails.
+		 * - In _attribute context_, "&not" is decoded to "¬": there is no next input
+		 *   character, so condition 3 fails.
+		 * - In _attribute context_, "&not=me" is left as the literal text "&not=me": all
+		 *   three conditions hold.
+		 * - In _attribute context_, "&notme" is left as the literal text "&notme": all
+		 *   three conditions hold.
+		 *
+		 * Without these special rules, ordinary URL query strings could have surprising
+		 * replacements applied. Consider:
+		 *
+		 *     <a href="/?random&degree&gt=0&lt=360&not=90">
+		 *
+		 * The literal attribute value `/?random&degree&gt=0&lt=360&not=90` is preserved
+		 * by the special handling. Otherwise, the value would decode to
+		 * `/?random°ree>=0<=360¬=90`, which is unlikely to be the author's intent.
+		 *
+		 * (Authors should not rely on this. Escaping the example as
+		 * `/?random&amp;degree&amp;gt=0&amp;lt=360&amp;not=90` produces the intended
+		 * value regardless of the following character.)
+		 *
+		 * @see https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+		 * @see https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
 		 */
-		$ambiguous_follower = (
-			$after_name < $length &&
-			$name_at < $length &&
-			(
-				ctype_alnum( $text[ $after_name ] ) ||
-				'=' === $text[ $after_name ]
-			)
-		);
-
-		// It's non-ambiguous, safe to leave it in.
-		if ( ! $ambiguous_follower ) {
+		if ( 'attribute' !== $context || ';' === $text[ $after_name - 1 ] || $after_name >= $length ) {
 			$match_byte_length = $after_name - $at;
 			return $replacement;
 		}
 
-		// It's ambiguous, which isn't allowed inside attributes.
-		if ( 'attribute' === $context ) {
+		$follower_byte = ord( $text[ $after_name ] );
+		if (
+			0x3D === $follower_byte || // EQUALS SIGN
+			( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) || // ASCII digits 0-9
+			( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) || // ASCII upper alpha A-Z
+			( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) // ASCII lower alpha a-z
+		) {
 			return null;
 		}
 
diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php
index 97954f4eb3e30..158115cdfbf06 100644
--- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php
+++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php
@@ -12,6 +12,55 @@
  * @coversDefaultClass WP_HTML_Decoder
  */
 class Tests_HtmlApi_WpHtmlDecoder extends WP_UnitTestCase {
+	/**
+	 * Original LC_CTYPE locale.
+	 *
+	 * @var string|false
+	 */
+	private static $original_lc_ctype = false;
+
+	/**
+	 * Locale where ctype_alnum() classifies high-bit bytes as alphanumeric.
+	 *
+	 * @var string|null
+	 */
+	private static ?string $problematic_lc_ctype = null;
+
+	public static function set_up_before_class() {
+		parent::set_up_before_class();
+
+		self::$original_lc_ctype = setlocale( LC_CTYPE, 0 );
+
+		// Find a locale where ctype_alnum() classifies high-bit bytes as alphanumeric.
+		$locale_candidates = array(
+			'C.UTF-8',
+			'C.utf8',
+			'en_US.UTF-8',
+			'en_US.utf8',
+			'en_GB.UTF-8',
+			'en_GB.utf8',
+		);
+		foreach ( $locale_candidates as $locale ) {
+			$candidate_locale = setlocale( LC_CTYPE, $locale );
+
+			if ( false !== $candidate_locale && ctype_alnum( "\xC2" ) ) {
+				self::$problematic_lc_ctype = $candidate_locale;
+				break;
+			}
+		}
+
+		if ( self::$original_lc_ctype ) {
+			setlocale( LC_CTYPE, self::$original_lc_ctype );
+		}
+	}
+
+	public function tear_down() {
+		if ( self::$original_lc_ctype ) {
+			setlocale( LC_CTYPE, self::$original_lc_ctype );
+		}
+		parent::tear_down();
+	}
+
 	/**
 	 * Ensures proper decoding of edge cases.
 	 *
@@ -61,6 +110,115 @@ static function ( int $errno, string $errstr ) use ( &$errors ) {
 		$this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' );
 	}
 
+	/**
+	 * Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes.
+	 *
+	 * @dataProvider data_semicolonless_attribute_behaviors
+	 *
+	 * @ticket 65372
+	 *
+	 * @param string $encoded_attribute_value Raw attribute value starting with a semicolonless legacy reference.
+	 * @param string $expected                Expected result of decoding the full attribute value.
+	 * @param string $expected_decode         Expected replacement for the reference read at offset 0.
+	 * @param int    $expected_byte_length    Expected byte length of the reference matched at offset 0.
+	 */
+	public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower( string $encoded_attribute_value, string $expected, string $expected_decode, int $expected_byte_length ): void {
+		if ( null !== self::$problematic_lc_ctype ) {
+			setlocale( LC_CTYPE, self::$problematic_lc_ctype );
+		}
+
+		$this->assertSame(
+			$expected,
+			WP_HTML_Decoder::decode_attribute( $encoded_attribute_value ),
+			'Failed to decode the full attribute value as expected.'
+		);
+
+		$match_byte_length = null;
+		$this->assertSame(
+			$expected_decode,
+			WP_HTML_Decoder::read_character_reference( 'attribute', $encoded_attribute_value, 0, $match_byte_length ),
+			'Failed to decode the character reference as expected.'
+		);
+		$this->assertSame( $expected_byte_length, $match_byte_length, 'Failed to produce expected byte length.' );
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * Attribute values encoded with character references including followers that are
+	 * treated as alphanumerics by `ctype_alnum()` on some systems, but should never
+	 * be recognized as ASCII Alphanumerics according to the HTML standards.
+	 *
+	 * @see https://html.spec.whatwg.org/#named-character-reference-state
+	 *
+	 * @return array Test cases.
+	 */
+	public static function data_semicolonless_attribute_behaviors(): array {
+		return array(
+			array( '&copy¯\_(ツ)_/¯', '©¯\_(ツ)_/¯', '©', 5 ),
+			array( '&notಠ_ಠ', '¬ಠ_ಠ', '¬', 4 ),
+			array( '&nbsp£20', "\u{00A0}£20", "\u{00A0}", 5 ),
+			array( '&nbsp🎉', "\u{00A0}🎉", "\u{00A0}", 5 ),
+			array( '&reg™', '®™', '®', 4 ),
+		);
+	}
+
+	/**
+	 * Ensures ambiguous ampersand is recognized with trailing ASCII alphanumerics.
+	 *
+	 * @dataProvider data_semicolonless_attribute_character_reference_no_decode_followers
+	 *
+	 * @ticket 65372
+	 *
+	 * @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower.
+	 */
+	public function test_ascii_alphanumeric_attribute_follower_is_ambiguous( string $raw_attribute ): void {
+		$this->assertSame(
+			$raw_attribute,
+			WP_HTML_Decoder::decode_attribute( $raw_attribute ),
+			'Should not have decoded an ambiguous semicolonless legacy reference.'
+		);
+
+		$match_byte_length = 'sentinel';
+		$this->assertNull(
+			WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ),
+			'Should not have matched an ambiguous semicolonless legacy reference.'
+		);
+		$this->assertSame( 'sentinel', $match_byte_length );
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * HTML character references with followers that trigger the literal flush behavior
+	 * when parsing attribute values. HTML defines this as `=` or an ASCII alphanumeric character.
+	 *
+	 * > An ASCII alphanumeric is an ASCII digit or ASCII alpha.
+	 * > An ASCII alpha is an ASCII upper alpha or ASCII lower alpha.
+	 *
+	 * @see https://html.spec.whatwg.org/#named-character-reference-state
+	 *
+	 * @return Generator Test cases.
+	 */
+	public static function data_semicolonless_attribute_character_reference_no_decode_followers(): Generator {
+		yield "Equals sign follower '='" => array( '&Aacute=' );
+		// > An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9), inclusive.
+		for ( $i = 0x30; $i <= 0x39; $i++ ) {
+			$char = chr( $i );
+			yield "ASCII digit follower '{$char}'" => array( "&Aacute{$char}" );
+		}
+		// > An ASCII upper alpha is a code point in the range U+0041 (A) to U+005A (Z), inclusive.
+		for ( $i = 0x41; $i <= 0x5A; $i++ ) {
+			$char = chr( $i );
+			yield "ASCII upper alpha follower '{$char}'" => array( "&Aacute{$char}" );
+		}
+		// > An ASCII lower alpha is a code point in the range U+0061 (a) to U+007A (z), inclusive.
+		for ( $i = 0x61; $i <= 0x7A; $i++ ) {
+			$char = chr( $i );
+			yield "ASCII lower alpha follower '{$char}'" => array( "&Aacute{$char}" );
+		}
+	}
+
 	/**
 	 * Ensures proper detection of attribute prefixes ignoring ASCII case.
 	 *