From 72d3af42f96ff3b171c80b0a10029590f97cd3fb Mon Sep 17 00:00:00 2001 From: Chris Huber Date: Sat, 27 Jun 2026 19:51:13 -0400 Subject: [PATCH] Refactor: extract shared DOM/HTML helpers into DomHelpersTrait (slice 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure, behavior-preserving move of 22 broadly-shared low-level helpers out of HtmlTransformer into HtmlToBlocks\Support\DomHelpersTrait, which HtmlTransformer now `use`s. No logic, signature, or output changes — trait methods are inlined into the class so private state/method access is unchanged. Refs #242 (slice 2: shared DOM helpers). Unblocks future slices that depend on attr/elementSelector/safeFallbackHtml/boundedFallbackHtml/normalizedNavigationLabel and friends without dragging HtmlTransformer along. Parity 128->128, canonical contracts green, full `composer test` green. Co-Authored-By: Claude Opus 4.8 --- .../src/HtmlToBlocks/HtmlTransformer.php | 254 +---------------- .../HtmlToBlocks/Support/DomHelpersTrait.php | 269 ++++++++++++++++++ 2 files changed, 272 insertions(+), 251 deletions(-) create mode 100644 php-transformer/src/HtmlToBlocks/Support/DomHelpersTrait.php diff --git a/php-transformer/src/HtmlToBlocks/HtmlTransformer.php b/php-transformer/src/HtmlToBlocks/HtmlTransformer.php index 7df78626..41cd1aa0 100644 --- a/php-transformer/src/HtmlToBlocks/HtmlTransformer.php +++ b/php-transformer/src/HtmlToBlocks/HtmlTransformer.php @@ -14,6 +14,7 @@ use Automattic\BlocksEngine\PhpTransformer\HtmlToBlocks\Patterns\NavigationPattern; use Automattic\BlocksEngine\PhpTransformer\HtmlToBlocks\Patterns\PatternContext; use Automattic\BlocksEngine\PhpTransformer\HtmlToBlocks\Patterns\PatternRecognizerRegistry; +use Automattic\BlocksEngine\PhpTransformer\HtmlToBlocks\Support\DomHelpersTrait; use Automattic\BlocksEngine\PhpTransformer\WordPress\Runtime; use DOMDocument; use DOMElement; @@ -21,6 +22,8 @@ final class HtmlTransformer { + use DomHelpersTrait; + private const MAX_INTERACTION_CANDIDATES = 100; /** @@ -808,11 +811,6 @@ private function collectBlockNavigationItems(array $blocks, array &$items): void } } - private function normalizedNavigationLabel(string $label): string - { - return trim(preg_replace('/\s+/', ' ', html_entity_decode($this->runtime->stripAllTags($label), ENT_QUOTES | ENT_HTML5, 'UTF-8')) ?? $label); - } - private function sourceNavigationAnchorLabel(DOMElement $anchor): string { $label = $this->normalizedNavigationLabel($anchor->textContent ?? ''); @@ -2224,36 +2222,6 @@ private function sourceConversionMetadata(string $blockName, DOMElement $element ); } - private function innerHtml(DOMElement $element): string - { - $html = ''; - foreach ( $element->childNodes as $child ) { - $html .= $element->ownerDocument->saveHTML($child); - } - - return trim($html); - } - - private function innerHtmlPreservingWhitespace(DOMElement $element): string - { - $html = ''; - foreach ( $element->childNodes as $child ) { - $html .= $element->ownerDocument->saveHTML($child); - } - - return $html; - } - - private function outerHtml(DOMElement $element): string - { - return trim($element->ownerDocument->saveHTML($element) ?: ''); - } - - private function attr(DOMElement $element, string $name): string - { - return $element->hasAttribute($name) ? $element->getAttribute($name) : ''; - } - /** * @return array */ @@ -2269,16 +2237,6 @@ private function presentationAttributes(DOMElement $element): array ), static fn ($value): bool => is_array($value) ? array() !== $value : '' !== trim((string) $value)); } - private function safeAnchor(string $id): string - { - $id = trim($id); - if ( '' === $id || ! preg_match('/^[A-Za-z][A-Za-z0-9_-]*$/', $id) ) { - return ''; - } - - return $id; - } - private function mergedPresentationStyle(DOMElement $element): string { $inlineStyle = $this->attr($element, 'style'); @@ -2949,44 +2907,6 @@ private function dynamicTextContent(DOMElement $element): ?string return $this->attr($element, 'data-prefix') . $value . $this->attr($element, 'data-suffix'); } - private function hasClass(DOMElement $element, string $className): bool - { - return in_array($className, preg_split('/\s+/', trim($this->attr($element, 'class'))) ?: array(), true); - } - - private function elementSelector(DOMElement $element): string - { - $parts = array(); - $current = $element; - while ( $current instanceof DOMElement && 'body' !== strtolower($current->tagName) ) { - $tagName = strtolower($current->tagName); - $index = 1; - for ( $sibling = $current->previousSibling; $sibling instanceof DOMNode; $sibling = $sibling->previousSibling ) { - if ( $sibling instanceof DOMElement && strtolower($sibling->tagName) === $tagName ) { - ++$index; - } - } - array_unshift($parts, $tagName . ':nth-of-type(' . $index . ')'); - $current = $current->parentNode instanceof DOMElement ? $current->parentNode : null; - } - - return implode(' > ', $parts); - } - - /** - * @return array - */ - private function htmlAttributes(DOMElement $element): array - { - $attributes = array(); - foreach ( $element->attributes ?? array() as $attribute ) { - $attributes[$attribute->nodeName] = $attribute->nodeValue ?? ''; - } - - ksort($attributes); - return $attributes; - } - /** * @return array */ @@ -3022,19 +2942,6 @@ private function sourceContext(DOMElement $element): array ), static fn (mixed $value): bool => '' !== $value && array() !== $value); } - /** - * @return array - */ - private function ancestorTags(DOMElement $element): array - { - $tags = array(); - for ( $parent = $element->parentNode; $parent instanceof DOMElement && 'body' !== strtolower($parent->tagName); $parent = $parent->parentNode ) { - $tags[] = strtolower($parent->tagName); - } - - return $tags; - } - private function nearestPreviousHeadingText(DOMElement $element): string { for ( $node = $element->previousSibling; $node instanceof DOMNode; $node = $node->previousSibling ) { @@ -3046,14 +2953,6 @@ private function nearestPreviousHeadingText(DOMElement $element): string return ''; } - /** - * @return array - */ - private function classNames(DOMElement $element): array - { - return array_values(array_filter(preg_split('/\s+/', trim($this->attr($element, 'class'))) ?: array())); - } - /** * @return array */ @@ -3200,18 +3099,6 @@ private function safeSourceFragment(DOMElement $element): string return $html; } - private function childElementCount(DOMElement $element): int - { - $count = 0; - foreach ( $element->childNodes as $child ) { - if ( $child instanceof DOMElement ) { - ++$count; - } - } - - return $count; - } - /** * @return array> */ @@ -3567,21 +3454,6 @@ private function materializationHintForInteractionKind(string $kind): string }; } - private function closestTagName(DOMElement $element): ?string - { - return $element->parentNode instanceof DOMElement ? strtolower($element->parentNode->tagName) : null; - } - - private function firstChildElement(DOMElement $element, string $tagName): ?DOMElement - { - foreach ( $element->childNodes as $child ) { - if ( $child instanceof DOMElement && strtolower($child->tagName) === $tagName ) { - return $child; - } - } - return null; - } - private function figureMediaElement(DOMElement $figure, string $tagName): ?DOMElement { $direct = $this->firstChildElement($figure, $tagName); @@ -3635,39 +3507,6 @@ private function figureLinkedMediaAnchor(DOMElement $figure): ?DOMElement return $anchor instanceof DOMElement && $this->isImageOnlyAnchor($anchor) ? $anchor : null; } - private function onlyChildElement(DOMElement $element, string $tagName): ?DOMElement - { - $match = null; - foreach ( $element->childNodes as $child ) { - if ( XML_TEXT_NODE === $child->nodeType && '' === trim($child->textContent ?? '') ) { - continue; - } - - if ( ! $child instanceof DOMElement || strtolower($child->tagName) !== $tagName || null !== $match ) { - return null; - } - - $match = $child; - } - - return $match; - } - - /** - * @param array $excludedTags - */ - private function innerHtmlWithoutTags(DOMElement $element, array $excludedTags): string - { - $html = ''; - foreach ( $element->childNodes as $child ) { - if ( $child instanceof DOMElement && in_array(strtolower($child->tagName), $excludedTags, true) ) { - continue; - } - $html .= $element->ownerDocument->saveHTML($child); - } - return trim($html); - } - private function citationFromElement(DOMElement $element): string { foreach ( $element->childNodes as $child ) { @@ -3914,16 +3753,6 @@ private function listItems(DOMElement $list, array &$fallbacks): array return $items; } - private function safeFallbackHtml(DOMElement $element): string - { - $html = preg_replace('@<(script|style)[^>]*?>.*?@si', '', $this->outerHtml($element)) ?? ''; - $html = preg_replace('/\s+on[a-z]+\s*=\s*("[^"]*"|\'[^\']*\'|[^\s>]+)/i', '', $html) ?? ''; - $html = preg_replace('/\s+(?:href|src|xlink:href)\s*=\s*("\s*javascript:[^"]*"|\'\s*javascript:[^\']*\'|javascript:[^\s>]+)/i', '', $html) ?? ''; - $html = preg_replace('/\s+srcdoc\s*=\s*("[^"]*"|\'[^\']*\'|[^\s>]+)/i', '', $html) ?? ''; - - return trim($html); - } - /** * @return array|null */ @@ -4600,48 +4429,6 @@ private function safeCanvasAttributes(DOMElement $element): array return $safe; } - /** - * @return array{html: string, bytes: int, truncated: bool} - */ - private function boundedFallbackHtml(string $html): array - { - $bytes = strlen($html); - if ( $bytes > 2000 ) { - return array( - 'html' => substr($html, 0, 2000) . '...', - 'bytes' => $bytes, - 'truncated' => true, - ); - } - - return array( - 'html' => $html, - 'bytes' => $bytes, - 'truncated' => false, - ); - } - - /** - * @return array{text: string, bytes: int, truncated: bool} - */ - private function boundedFallbackText(string $text): array - { - $bytes = strlen($text); - if ( $bytes > 2000 ) { - return array( - 'text' => substr($text, 0, 2000) . '...', - 'bytes' => $bytes, - 'truncated' => true, - ); - } - - return array( - 'text' => $text, - 'bytes' => $bytes, - 'truncated' => false, - ); - } - /** * @return array */ @@ -5766,18 +5553,6 @@ private function looksLikeSplitLayout(DOMElement $element): bool return (bool) preg_match('/(?:^|[\s_-])(?:split|two[\s_-]?col|media[\s_-]?text|text[\s_-]?media|feature[\s_-]?row|hero[\s_-]?(?:inner|grid|content|layout)|content[\s_-]?grid)(?:$|[\s_-])/', $name); } - private function directElementChildCount(DOMElement $element): int - { - $count = 0; - foreach ( $element->childNodes as $child ) { - if ( $child instanceof DOMElement ) { - ++$count; - } - } - - return $count; - } - private function looksLikeDocumentationLayout(DOMElement $element): bool { $name = strtolower(trim($this->attr($element, 'class') . ' ' . $this->attr($element, 'id'))); @@ -6565,20 +6340,6 @@ private function nonCoreImageFigureClassName(DOMElement $figure): string return implode(' ', $classes); } - private function mergeClassNames(string ...$classNames): string - { - $classes = array(); - foreach ( $classNames as $className ) { - foreach ( preg_split('/\s+/', trim($className)) ?: array() as $class ) { - if ( '' !== $class && ! in_array($class, $classes, true) ) { - $classes[] = $class; - } - } - } - - return implode(' ', $classes); - } - /** * @return array */ @@ -6687,15 +6448,6 @@ private function sanitizedSyntaxHtml(DOMElement $element): string /** * @param array $attrs */ - private function htmlAttributeString(array $attrs): string - { - $html = ''; - foreach ( $attrs as $name => $value ) { - $html .= ' ' . $name . '="' . htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '"'; - } - return $html; - } - /** * @return array> */ diff --git a/php-transformer/src/HtmlToBlocks/Support/DomHelpersTrait.php b/php-transformer/src/HtmlToBlocks/Support/DomHelpersTrait.php new file mode 100644 index 00000000..8fc1e149 --- /dev/null +++ b/php-transformer/src/HtmlToBlocks/Support/DomHelpersTrait.php @@ -0,0 +1,269 @@ +runtime->stripAllTags($label), ENT_QUOTES | ENT_HTML5, 'UTF-8')) ?? $label); + } + + private function innerHtml(DOMElement $element): string + { + $html = ''; + foreach ( $element->childNodes as $child ) { + $html .= $element->ownerDocument->saveHTML($child); + } + + return trim($html); + } + + private function innerHtmlPreservingWhitespace(DOMElement $element): string + { + $html = ''; + foreach ( $element->childNodes as $child ) { + $html .= $element->ownerDocument->saveHTML($child); + } + + return $html; + } + + private function outerHtml(DOMElement $element): string + { + return trim($element->ownerDocument->saveHTML($element) ?: ''); + } + + private function attr(DOMElement $element, string $name): string + { + return $element->hasAttribute($name) ? $element->getAttribute($name) : ''; + } + + private function safeAnchor(string $id): string + { + $id = trim($id); + if ( '' === $id || ! preg_match('/^[A-Za-z][A-Za-z0-9_-]*$/', $id) ) { + return ''; + } + + return $id; + } + + private function hasClass(DOMElement $element, string $className): bool + { + return in_array($className, preg_split('/\s+/', trim($this->attr($element, 'class'))) ?: array(), true); + } + + private function elementSelector(DOMElement $element): string + { + $parts = array(); + $current = $element; + while ( $current instanceof DOMElement && 'body' !== strtolower($current->tagName) ) { + $tagName = strtolower($current->tagName); + $index = 1; + for ( $sibling = $current->previousSibling; $sibling instanceof DOMNode; $sibling = $sibling->previousSibling ) { + if ( $sibling instanceof DOMElement && strtolower($sibling->tagName) === $tagName ) { + ++$index; + } + } + array_unshift($parts, $tagName . ':nth-of-type(' . $index . ')'); + $current = $current->parentNode instanceof DOMElement ? $current->parentNode : null; + } + + return implode(' > ', $parts); + } + + /** + * @return array + */ + private function htmlAttributes(DOMElement $element): array + { + $attributes = array(); + foreach ( $element->attributes ?? array() as $attribute ) { + $attributes[$attribute->nodeName] = $attribute->nodeValue ?? ''; + } + + ksort($attributes); + return $attributes; + } + + /** + * @return array + */ + private function ancestorTags(DOMElement $element): array + { + $tags = array(); + for ( $parent = $element->parentNode; $parent instanceof DOMElement && 'body' !== strtolower($parent->tagName); $parent = $parent->parentNode ) { + $tags[] = strtolower($parent->tagName); + } + + return $tags; + } + + /** + * @return array + */ + private function classNames(DOMElement $element): array + { + return array_values(array_filter(preg_split('/\s+/', trim($this->attr($element, 'class'))) ?: array())); + } + + private function childElementCount(DOMElement $element): int + { + $count = 0; + foreach ( $element->childNodes as $child ) { + if ( $child instanceof DOMElement ) { + ++$count; + } + } + + return $count; + } + + private function closestTagName(DOMElement $element): ?string + { + return $element->parentNode instanceof DOMElement ? strtolower($element->parentNode->tagName) : null; + } + + private function firstChildElement(DOMElement $element, string $tagName): ?DOMElement + { + foreach ( $element->childNodes as $child ) { + if ( $child instanceof DOMElement && strtolower($child->tagName) === $tagName ) { + return $child; + } + } + return null; + } + + private function onlyChildElement(DOMElement $element, string $tagName): ?DOMElement + { + $match = null; + foreach ( $element->childNodes as $child ) { + if ( XML_TEXT_NODE === $child->nodeType && '' === trim($child->textContent ?? '') ) { + continue; + } + + if ( ! $child instanceof DOMElement || strtolower($child->tagName) !== $tagName || null !== $match ) { + return null; + } + + $match = $child; + } + + return $match; + } + + /** + * @param array $excludedTags + */ + private function innerHtmlWithoutTags(DOMElement $element, array $excludedTags): string + { + $html = ''; + foreach ( $element->childNodes as $child ) { + if ( $child instanceof DOMElement && in_array(strtolower($child->tagName), $excludedTags, true) ) { + continue; + } + $html .= $element->ownerDocument->saveHTML($child); + } + return trim($html); + } + + private function safeFallbackHtml(DOMElement $element): string + { + $html = preg_replace('@<(script|style)[^>]*?>.*?@si', '', $this->outerHtml($element)) ?? ''; + $html = preg_replace('/\s+on[a-z]+\s*=\s*("[^"]*"|\'[^\']*\'|[^\s>]+)/i', '', $html) ?? ''; + $html = preg_replace('/\s+(?:href|src|xlink:href)\s*=\s*("\s*javascript:[^"]*"|\'\s*javascript:[^\']*\'|javascript:[^\s>]+)/i', '', $html) ?? ''; + $html = preg_replace('/\s+srcdoc\s*=\s*("[^"]*"|\'[^\']*\'|[^\s>]+)/i', '', $html) ?? ''; + + return trim($html); + } + + /** + * @return array{html: string, bytes: int, truncated: bool} + */ + private function boundedFallbackHtml(string $html): array + { + $bytes = strlen($html); + if ( $bytes > 2000 ) { + return array( + 'html' => substr($html, 0, 2000) . '...', + 'bytes' => $bytes, + 'truncated' => true, + ); + } + + return array( + 'html' => $html, + 'bytes' => $bytes, + 'truncated' => false, + ); + } + + /** + * @return array{text: string, bytes: int, truncated: bool} + */ + private function boundedFallbackText(string $text): array + { + $bytes = strlen($text); + if ( $bytes > 2000 ) { + return array( + 'text' => substr($text, 0, 2000) . '...', + 'bytes' => $bytes, + 'truncated' => true, + ); + } + + return array( + 'text' => $text, + 'bytes' => $bytes, + 'truncated' => false, + ); + } + + private function directElementChildCount(DOMElement $element): int + { + $count = 0; + foreach ( $element->childNodes as $child ) { + if ( $child instanceof DOMElement ) { + ++$count; + } + } + + return $count; + } + + private function mergeClassNames(string ...$classNames): string + { + $classes = array(); + foreach ( $classNames as $className ) { + foreach ( preg_split('/\s+/', trim($className)) ?: array() as $class ) { + if ( '' !== $class && ! in_array($class, $classes, true) ) { + $classes[] = $class; + } + } + } + + return implode(' ', $classes); + } + + private function htmlAttributeString(array $attrs): string + { + $html = ''; + foreach ( $attrs as $name => $value ) { + $html .= ' ' . $name . '="' . htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '"'; + } + return $html; + } +}