-
Notifications
You must be signed in to change notification settings - Fork 3.5k
HTML Decoder: Replace system-dependent ctype check with ASCII byte comparison #12286
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
sirreal
wants to merge
25
commits into
WordPress:trunk
Choose a base branch
from
sirreal:fix/html-decoder-legacy-follower-ascii
base: trunk
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+203
−23
Open
Changes from all commits
Commits
Show all changes
25 commits
Select commit
Hold shift + click to select a range
f92da80
Fix attribute legacy reference follower checks
sirreal 2876d8a
Merge remote-tracking branch 'upstream/trunk' into HEAD
sirreal 8bb66dc
Fix coding standards for decoder legacy follower checks
sirreal cc0d43a
Merge branch 'trunk' into fix/html-decoder-legacy-follower-ascii
sirreal 2187e33
Add ticket number
sirreal e6e98c7
Tests: Broaden UTF-8 locale candidates for HTML decoder
sirreal f9c7d74
Improve tests
sirreal ac0d842
Improve test logic
sirreal bd5566e
Test fixups
sirreal 65e3937
Fix test language
sirreal cb3b277
Fix test description
sirreal deccaae
Rework decoding bail to match spec, improve perf and clarity
sirreal 63a2fc1
clean up language
sirreal e0d7a45
Improve comment
sirreal a2a9357
Tighten up spec
sirreal 46c8153
Merge branch 'trunk' into fix/html-decoder-legacy-follower-ascii
sirreal 998be41
Fix data provider test name
sirreal 26a8d10
Fix tests phpdoc
sirreal e2ed016
Revert ctype_alnum() fix
sirreal 118ef06
Revert "Revert ctype_alnum() fix"
sirreal 3040e14
Improve language and add examples
sirreal b55f333
Improve documentation
sirreal 85028e0
Merge branch 'trunk' into fix/html-decoder-legacy-follower-ascii
sirreal ebeb091
Improve comment clarity about attribute special case
sirreal d3d88f0
Add clause about URL query strings
sirreal File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,51 @@ | |
| * @coversDefaultClass WP_HTML_Decoder | ||
| */ | ||
| class Tests_HtmlApi_WpHtmlDecoder extends WP_UnitTestCase { | ||
| /** | ||
| * Original LC_CTYPE locale. | ||
| * | ||
| * @var string|null | ||
| */ | ||
| private static ?string $original_lc_ctype = null; | ||
|
|
||
| /** | ||
| * Locale where ctype_alnum() classifies high-bit bytes as alphanumeric. | ||
| * | ||
| * @var string|null | ||
| */ | ||
| private static ?string $problematic_lc_ctype = null; | ||
|
|
||
| public static function set_up_before_class() { | ||
| parent::set_up_before_class(); | ||
|
|
||
| self::$original_lc_ctype = setlocale( LC_CTYPE, 0 ); | ||
|
|
||
| // Find a locale where ctype_alnum() classifies high-bit bytes as alphanumeric. | ||
| $locale_candidates = array( | ||
| 'C.UTF-8', | ||
| 'C.utf8', | ||
| 'en_US.UTF-8', | ||
| 'en_US.utf8', | ||
| 'en_GB.UTF-8', | ||
| 'en_GB.utf8', | ||
| ); | ||
| foreach ( $locale_candidates as $locale ) { | ||
| $candidate_locale = setlocale( LC_CTYPE, $locale ); | ||
|
|
||
| if ( false !== $candidate_locale && ctype_alnum( "\xC2" ) ) { | ||
| self::$problematic_lc_ctype = $candidate_locale; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| setlocale( LC_CTYPE, self::$original_lc_ctype ); | ||
| } | ||
|
|
||
| public function tear_down() { | ||
| setlocale( LC_CTYPE, self::$original_lc_ctype ); | ||
| parent::tear_down(); | ||
| } | ||
|
|
||
| /** | ||
| * Ensures proper decoding of edge cases. | ||
| * | ||
|
|
@@ -61,6 +106,115 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { | |
| $this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' ); | ||
| } | ||
|
|
||
| /** | ||
| * Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes. | ||
| * | ||
| * @dataProvider data_semicolonless_attribute_behaviors | ||
| * | ||
| * @ticket 65372 | ||
| */ | ||
| public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower( string $encoded_attribute_value, string $expected, string $expected_decode, int $expected_byte_length ): void { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the test that fails on |
||
| if ( null !== self::$problematic_lc_ctype ) { | ||
| setlocale( LC_CTYPE, self::$problematic_lc_ctype ); | ||
| } | ||
|
|
||
| $this->assertSame( | ||
| $expected, | ||
| WP_HTML_Decoder::decode_attribute( $encoded_attribute_value ), | ||
| 'Failed to decode the full attribute value as expected.' | ||
| ); | ||
|
|
||
| $match_byte_length = null; | ||
| $this->assertSame( | ||
| $expected_decode, | ||
| WP_HTML_Decoder::read_character_reference( 'attribute', $encoded_attribute_value, 0, $match_byte_length ), | ||
| 'Failed to decode the character reference as expected.' | ||
| ); | ||
| $this->assertSame( $expected_byte_length, $match_byte_length, 'Failed to produce expected byte length.' ); | ||
| } | ||
|
|
||
| /** | ||
| * Data provider. | ||
| * | ||
| * Attribute values encoded with character references including followers that are | ||
| * treated as alphanumerics by `ctype_alnum()` on some systems, but should never | ||
| * be recognized as ASCII Alphanumerics according to the HTML standards. | ||
| * | ||
| * @see https://html.spec.whatwg.org/#named-character-reference-state | ||
| * | ||
| * @return array<array{ | ||
| * string, // Encoded attribute value. | ||
| * string, // Expected full decode. | ||
| * string, // Expected character decode. | ||
| * int, // Replaced character reference byte length. | ||
| * }> Test cases. | ||
| */ | ||
| public static function data_semicolonless_attribute_behaviors(): array { | ||
| return array( | ||
| array( '©¯\_(ツ)_/¯', '©¯\_(ツ)_/¯', '©', 5 ), | ||
| array( '¬ಠ_ಠ', '¬ಠ_ಠ', '¬', 4 ), | ||
| array( ' £20', "\u{00A0}£20", "\u{00A0}", 5 ), | ||
| array( ' 🎉', "\u{00A0}🎉", "\u{00A0}", 5 ), | ||
| array( '®™', '®™', '®', 4 ), | ||
| ); | ||
| } | ||
|
|
||
| /** | ||
| * Ensures ambiguous ampersand is recognized with trailing ASCII alphanumerics. | ||
| * | ||
| * @dataProvider data_semicolonless_attribute_character_reference_no_decode_followers | ||
| * | ||
| * @ticket 65372 | ||
| * | ||
| * @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower. | ||
| */ | ||
| public function test_ascii_alphanumeric_attribute_follower_is_ambiguous( string $raw_attribute ): void { | ||
| $this->assertSame( | ||
| $raw_attribute, | ||
| WP_HTML_Decoder::decode_attribute( $raw_attribute ), | ||
| 'Should not have decoded an ambiguous semicolonless legacy reference.' | ||
| ); | ||
|
|
||
| $match_byte_length = 'sentinel'; | ||
| $this->assertNull( | ||
| WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ), | ||
| 'Should not have matched an ambiguous semicolonless legacy reference.' | ||
| ); | ||
| $this->assertSame( 'sentinel', $match_byte_length ); | ||
| } | ||
|
|
||
| /** | ||
| * Data provider. | ||
| * | ||
| * HTML character references with followers that trigger the literal flush behavior | ||
| * when parsing attribute values. HTML defines this as `=` or an ASCII alphanumeric character. | ||
| * | ||
| * > An ASCII alphanumeric is an ASCII digit or ASCII alpha. | ||
| * > An ASCII alpha is an ASCII upper alpha or ASCII lower alpha. | ||
| * | ||
| * @see https://html.spec.whatwg.org/#named-character-reference-state | ||
| * | ||
| * @return Generator<string, array{ string }> Test cases. | ||
| */ | ||
| public static function data_semicolonless_attribute_character_reference_no_decode_followers(): Generator { | ||
| yield "Equals sign follower '='" => array( 'Á=' ); | ||
| // > An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9), inclusive. | ||
| for ( $i = 0x30; $i <= 0x39; $i++ ) { | ||
| $char = chr( $i ); | ||
| yield "ASCII digit follower '{$char}'" => array( "Á{$char}" ); | ||
| } | ||
| // > An ASCII upper alpha is a code point in the range U+0041 (A) to U+005A (Z), inclusive. | ||
| for ( $i = 0x41; $i <= 0x5A; $i++ ) { | ||
| $char = chr( $i ); | ||
| yield "ASCII upper alpha follower '{$char}'" => array( "Á{$char}" ); | ||
| } | ||
| // > An ASCII lower alpha is a code point in the range U+0061 (a) to U+007A (z), inclusive. | ||
| for ( $i = 0x61; $i <= 0x7A; $i++ ) { | ||
| $char = chr( $i ); | ||
| yield "ASCII lower alpha follower '{$char}'" => array( "Á{$char}" ); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Ensures proper detection of attribute prefixes ignoring ASCII case. | ||
| * | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know whether it's worth checking multiple locales or all of these locales are likely to all have the same behavior on the same system. For example, my system has the issue with
"C.UTF-8", the other.UTF-8locales listed here, and more.