sirreal · sirreal · Jun 29, 2026
diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -44,6 +44,39 @@
  *         $processor->add_class( 'responsive-image' );
  *     }
  *
+ * #### Reading ordinary text from a subtree
+ *
+ * Ordinary text extraction is usually a tree-aware operation, so use the HTML
+ * Processor and walk the subtree. Append only `#text` tokens unless the caller
+ * intentionally asks for another token type. Do not use
+ * {@see WP_HTML_Tag_Processor::get_modifiable_text()} itself as a test for
+ * ordinary text, because comments and special elements can also carry
+ * modifiable text.
+ *
+ * Example:
+ *
+ *     $processor = WP_HTML_Processor::create_fragment( $html );
+ *     if ( $processor->next_tag( 'ARTICLE' ) ) {
+ *         $article_depth = $processor->get_current_depth();
+ *         $text          = '';
+ *
+ *         while ( $processor->next_token() && $processor->get_current_depth() >= $article_depth ) {
+ *             if ( '#text' === $processor->get_token_type() ) {
+ *                 $text .= $processor->get_modifiable_text();
+ *             }
+ *         }
+ *     }
+ *
+ * HTML elements whose contents cannot contain markup, such as SCRIPT, STYLE,
+ * TITLE, and TEXTAREA, do not expose their contents as child `#text` tokens.
+ * Their contents are available on the element token itself and should be read
+ * only when the caller specifically asks for that element's own contents.
+ *
+ * For read-only extraction, parser state such as
+ * {@see WP_HTML_Tag_Processor::paused_at_incomplete_token()} or
+ * {@see WP_HTML_Processor::get_last_error()} reports how the scan ended.
+ * Whether text collected before an early stop is acceptable is the caller's
+ * policy.
+ *
  * #### Breadcrumbs
  *
  * Breadcrumbs represent the stack of open elements from the root
@@ -5577,6 +5610,13 @@ public function class_list() {
 	 * that a token has modifiable text, and a token with modifiable text may
 	 * have an empty string (e.g. a comment with no contents).
 	 *
+	 * This method is not a predicate for ordinary text. For ordinary subtree
+	 * text extraction, first require `'#text' === get_token_type()`, then call
+	 * this method. HTML SCRIPT, STYLE, TITLE, and TEXTAREA contents are carried
+	 * on the element token itself, with no child `#text` token; read that
+	 * opener-carried text only when the caller specifically asks for the
+	 * element's own contents.
+	 *
 	 * @since 6.6.0 Subclassed for the HTML Processor.
 	 *
 	 * @return string

diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -3639,6 +3639,15 @@ public function subdivide_text_appropriately(): bool {
 	 * that a token has modifiable text, and a token with modifiable text may
 	 * have an empty string (e.g. a comment with no contents).
 	 *
+	 * The returned string is already decoded where HTML decodes text: in `#text`
+	 * nodes, TITLE contents, and TEXTAREA contents, character references are
+	 * replaced by the characters they represent, so `&amp;` is returned as `&`.
+	 * Do not decode the returned string again: source text `&amp;lt;` is returned
+	 * as `&lt;`, and a second decoding would wrongly turn it into `<`. SCRIPT,
+	 * STYLE, and comment contents are returned verbatim because HTML does not
+	 * decode character references there.
+	 *
+	 * The returned string is UTF-8. When measuring or slicing it by code points,
+	 * pass an explicit encoding, for example `mb_strlen( $text, 'UTF-8' )`.
+	 *
 	 * Limitations:
 	 *
 	 *  - This function will not strip the leading newline appropriately