Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
f92da80
Fix attribute legacy reference follower checks
sirreal Jun 12, 2026
2876d8a
Merge remote-tracking branch 'upstream/trunk' into HEAD
sirreal Jun 15, 2026
8bb66dc
Fix coding standards for decoder legacy follower checks
sirreal Jun 15, 2026
cc0d43a
Merge branch 'trunk' into fix/html-decoder-legacy-follower-ascii
sirreal Jun 23, 2026
2187e33
Add ticket number
sirreal Jun 23, 2026
e6e98c7
Tests: Broaden UTF-8 locale candidates for HTML decoder
sirreal Jun 23, 2026
f9c7d74
Improve tests
sirreal Jun 23, 2026
ac0d842
Improve test logic
sirreal Jun 23, 2026
bd5566e
Test fixups
sirreal Jun 23, 2026
65e3937
Fix test language
sirreal Jun 23, 2026
cb3b277
Fix test description
sirreal Jun 23, 2026
deccaae
Rework decoding bail to match spec, improve perf and clarity
sirreal Jun 23, 2026
63a2fc1
clean up language
sirreal Jun 23, 2026
e0d7a45
Improve comment
sirreal Jun 23, 2026
a2a9357
Tighten up spec
sirreal Jun 23, 2026
46c8153
Merge branch 'trunk' into fix/html-decoder-legacy-follower-ascii
sirreal Jun 23, 2026
998be41
Fix data provider test name
sirreal Jun 23, 2026
26a8d10
Fix tests phpdoc
sirreal Jun 23, 2026
e2ed016
Revert ctype_alnum() fix
sirreal Jun 23, 2026
118ef06
Revert "Revert ctype_alnum() fix"
sirreal Jun 23, 2026
3040e14
Improve language and add examples
sirreal Jun 23, 2026
b55f333
Improve documentation
sirreal Jun 23, 2026
85028e0
Merge branch 'trunk' into fix/html-decoder-legacy-follower-ascii
sirreal Jun 25, 2026
ebeb091
Improve comment clarity about attribute special case
sirreal Jun 25, 2026
d3d88f0
Add clause about URL query strings
sirreal Jun 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 49 additions & 23 deletions src/wp-includes/html-api/class-wp-html-decoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -367,34 +367,60 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat

$after_name = $name_at + $name_length;

// If the match ended with a semicolon then it should always be decoded.
if ( ';' === $text[ $name_at + $name_length - 1 ] ) {
$match_byte_length = $after_name - $at;
return $replacement;
}

/*
* At this point though there's a match for an entry in the named
* character reference table but the match doesn't end in `;`.
* It may be allowed if it's followed by something unambiguous.
/**
* For historical reasons, a matched named character reference is left as literal
* text (its decoded replacement is not used) when all of the following hold:
*
* 1. It was matched in attribute context.
* 2. The match does not end in U+003B SEMICOLON (;) — i.e. it is one of the
* legacy forms recognized without a trailing semicolon.
* 3. The next input character is U+003D EQUALS SIGN (=) or an ASCII alphanumeric.
*
* Some illustrative examples follow. Note that both `not` and `not;` appear in the
* named character references list. References start with `&` and typically end with
* `;`, but the legacy forms are recognized without one.
*
* - In _data context_, "&notme" is decoded to "¬me": condition 1 fails (not an
* attribute), so the reference is decoded.
* - In _attribute context_, "&not;me" is decoded to "¬me": the longest match is
* "not;", which ends in a semicolon, so condition 2 fails.
* - In _attribute context_, "&not己" is decoded to "¬己": the following character
* "己" is a letter but not an ASCII alphanumeric (nor "="), so condition 3 fails.
* - In _attribute context_, "&not" is decoded to "¬": there is no next input
* character, so condition 3 fails.
* - In _attribute context_, "&not=me" is left as the literal text "&not=me": all
* three conditions hold.
* - In _attribute context_, "&notme" is left as the literal text "&notme": all
* three conditions hold.
*
* Without these special rules, ordinary URL query strings could have surprising
* replacements applied. Consider:
*
* <a href="/?random&degree&gt=0&lt=360&not=90">
*
* These special rules preserve the literal attribute value:
* `/?random&degree&gt=0&lt=360&not=90`. Without them, the value would be decoded
* as `/?random°ree>=0<=360¬=90`, likely not the intended value.
*
* (Authors should not rely on this. Escaping the example as
* `/?random&amp;degree&amp;gt=0&amp;lt=360&amp;not=90` produces the intended
* value regardless of the following character.)
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
* @see https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
*/
$ambiguous_follower = (
$after_name < $length &&
$name_at < $length &&
(
ctype_alnum( $text[ $after_name ] ) ||
'=' === $text[ $after_name ]
)
);

// It's non-ambiguous, safe to leave it in.
if ( ! $ambiguous_follower ) {
if ( 'attribute' !== $context || ';' === $text[ $after_name - 1 ] || $after_name >= $length ) {
$match_byte_length = $after_name - $at;
return $replacement;
}

// It's ambiguous, which isn't allowed inside attributes.
if ( 'attribute' === $context ) {
$follower_byte = ord( $text[ $after_name ] );
if (
0x3D === $follower_byte || // EQUALS SIGN
( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) || // ASCII digits 0-9
( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) || // ASCII upper alpha A-Z
( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) // ASCII lower alpha a-z
) {
return null;
}

Expand Down
154 changes: 154 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlDecoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,51 @@
* @coversDefaultClass WP_HTML_Decoder
*/
class Tests_HtmlApi_WpHtmlDecoder extends WP_UnitTestCase {
/**
* Original LC_CTYPE locale.
*
* @var string|null
*/
private static ?string $original_lc_ctype = null;

/**
* Locale where ctype_alnum() classifies high-bit bytes as alphanumeric.
*
* @var string|null
*/
private static ?string $problematic_lc_ctype = null;

public static function set_up_before_class() {
parent::set_up_before_class();

self::$original_lc_ctype = setlocale( LC_CTYPE, 0 );

// Find a locale where ctype_alnum() classifies high-bit bytes as alphanumeric.
$locale_candidates = array(
'C.UTF-8',
'C.utf8',
'en_US.UTF-8',
'en_US.utf8',
'en_GB.UTF-8',
'en_GB.utf8',
);
Comment on lines +35 to +42

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know whether it's worth checking multiple locales, or whether all of these locales are likely to behave the same way on a given system. For example, my system has the issue with "C.UTF-8", the other .UTF-8 locales listed here, and more.

foreach ( $locale_candidates as $locale ) {
$candidate_locale = setlocale( LC_CTYPE, $locale );

if ( false !== $candidate_locale && ctype_alnum( "\xC2" ) ) {
self::$problematic_lc_ctype = $candidate_locale;
break;
}
}

setlocale( LC_CTYPE, self::$original_lc_ctype );
}

/**
 * Puts the LC_CTYPE locale back to the value recorded in
 * `set_up_before_class()` after each test, then runs the parent tear-down.
 *
 * Tests in this class may switch LC_CTYPE to a different locale, so the
 * recorded value is re-applied before handing control back to the parent.
 */
public function tear_down() {
	$recorded_lc_ctype = self::$original_lc_ctype;
	setlocale( LC_CTYPE, $recorded_lc_ctype );

	parent::tear_down();
}

/**
* Ensures proper decoding of edge cases.
*
Expand Down Expand Up @@ -61,6 +106,115 @@ static function ( int $errno, string $errstr ) use ( &$errors ) {
$this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' );
}

/**
* Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes.
*
* @dataProvider data_semicolonless_attribute_behaviors
*
* @ticket 65372
*/
public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower( string $encoded_attribute_value, string $expected, string $expected_decode, int $expected_byte_length ): void {

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the test that fails on trunk depending on the system.

if ( null !== self::$problematic_lc_ctype ) {
setlocale( LC_CTYPE, self::$problematic_lc_ctype );
}

$this->assertSame(
$expected,
WP_HTML_Decoder::decode_attribute( $encoded_attribute_value ),
'Failed to decode the full attribute value as expected.'
);

$match_byte_length = null;
$this->assertSame(
$expected_decode,
WP_HTML_Decoder::read_character_reference( 'attribute', $encoded_attribute_value, 0, $match_byte_length ),
'Failed to decode the character reference as expected.'
);
$this->assertSame( $expected_byte_length, $match_byte_length, 'Failed to produce expected byte length.' );
}

/**
* Data provider.
*
* Attribute values encoded with character references including followers that are
* treated as alphanumerics by `ctype_alnum()` on some systems, but should never
* be recognized as ASCII Alphanumerics according to the HTML standards.
*
* @see https://html.spec.whatwg.org/#named-character-reference-state
*
* @return array<array{
* string, // Encoded attribute value.
* string, // Expected full decode.
* string, // Expected character decode.
* int, // Replaced character reference byte length.
* }> Test cases.
*/
public static function data_semicolonless_attribute_behaviors(): array {
	// U+00A0 NO-BREAK SPACE, the replacement text for the `&nbsp` reference.
	$nbsp = "\u{00A0}";

	/*
	 * Each dataset is keyed by a descriptive name so that a failure reports
	 * which follower broke instead of an anonymous numeric index.
	 *
	 * Row layout: encoded attribute value, expected full decode, expected
	 * decode of the leading reference, and the byte length of the matched
	 * reference text (e.g. `&copy` is 5 bytes, `&not` is 4 bytes).
	 *
	 * Every follower is a non-ASCII character, so its UTF-8 encoding starts
	 * with a byte outside the ASCII range.
	 */
	return array(
		'&copy followed by U+00AF MACRON'               => array( '&copy¯\_(ツ)_/¯', '©¯\_(ツ)_/¯', '©', 5 ),
		'&not followed by U+0CA0 KANNADA LETTER TTHA'   => array( '&notಠ_ಠ', '¬ಠ_ಠ', '¬', 4 ),
		'&nbsp followed by U+00A3 POUND SIGN'           => array( '&nbsp£20', "{$nbsp}£20", $nbsp, 5 ),
		'&nbsp followed by U+1F389 PARTY POPPER'        => array( '&nbsp🎉', "{$nbsp}🎉", $nbsp, 5 ),
		'&reg followed by U+2122 TRADE MARK SIGN'       => array( '&reg™', '®™', '®', 4 ),
	);
}

/**
* Ensures ambiguous ampersand is recognized with trailing ASCII alphanumerics.
*
* @dataProvider data_semicolonless_attribute_character_reference_no_decode_followers
*
* @ticket 65372
*
* @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower.
*/
public function test_ascii_alphanumeric_attribute_follower_is_ambiguous( string $raw_attribute ): void {
	// Decoding the whole attribute value must hand the input back unchanged.
	$decoded_value = WP_HTML_Decoder::decode_attribute( $raw_attribute );
	$this->assertSame(
		$raw_attribute,
		$decoded_value,
		'Should not have decoded an ambiguous semicolonless legacy reference.'
	);

	// Seed the by-reference length with a marker so that any write to it is detectable.
	$untouched_marker = 'sentinel';
	$byte_length      = $untouched_marker;
	$reference        = WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $byte_length );

	$this->assertNull(
		$reference,
		'Should not have matched an ambiguous semicolonless legacy reference.'
	);
	$this->assertSame( $untouched_marker, $byte_length );
}

/**
* Data provider.
*
* HTML character references with followers that trigger the literal flush behavior
* when parsing attribute values. HTML defines this as `=` or an ASCII alphanumeric character.
*
* > An ASCII alphanumeric is an ASCII digit or ASCII alpha.
* > An ASCII alpha is an ASCII upper alpha or ASCII lower alpha.
*
* @see https://html.spec.whatwg.org/#named-character-reference-state
*
* @return Generator<string, array{ string }> Test cases.
*/
public static function data_semicolonless_attribute_character_reference_no_decode_followers(): Generator {
	yield "Equals sign follower '='" => array( '&Aacute=' );

	/*
	 * The three ASCII alphanumeric classes, in the order they are yielded,
	 * each mapped to its inclusive code point range:
	 *
	 * > An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9), inclusive.
	 * > An ASCII upper alpha is a code point in the range U+0041 (A) to U+005A (Z), inclusive.
	 * > An ASCII lower alpha is a code point in the range U+0061 (a) to U+007A (z), inclusive.
	 */
	$follower_classes = array(
		'ASCII digit'       => array( 0x30, 0x39 ),
		'ASCII upper alpha' => array( 0x41, 0x5A ),
		'ASCII lower alpha' => array( 0x61, 0x7A ),
	);

	foreach ( $follower_classes as $class_label => $bounds ) {
		$followers = array_map( 'chr', range( $bounds[0], $bounds[1] ) );

		foreach ( $followers as $follower ) {
			yield "{$class_label} follower '{$follower}'" => array( "&Aacute{$follower}" );
		}
	}
}

/**
* Ensures proper detection of attribute prefixes ignoring ASCII case.
*
Expand Down
Loading