diff --git a/php-transformer/src/HtmlToBlocks/HtmlTransformer.php b/php-transformer/src/HtmlToBlocks/HtmlTransformer.php
index 7df78626..41cd1aa0 100644
--- a/php-transformer/src/HtmlToBlocks/HtmlTransformer.php
+++ b/php-transformer/src/HtmlToBlocks/HtmlTransformer.php
@@ -14,6 +14,7 @@
use Automattic\BlocksEngine\PhpTransformer\HtmlToBlocks\Patterns\NavigationPattern;
use Automattic\BlocksEngine\PhpTransformer\HtmlToBlocks\Patterns\PatternContext;
use Automattic\BlocksEngine\PhpTransformer\HtmlToBlocks\Patterns\PatternRecognizerRegistry;
+use Automattic\BlocksEngine\PhpTransformer\HtmlToBlocks\Support\DomHelpersTrait;
use Automattic\BlocksEngine\PhpTransformer\WordPress\Runtime;
use DOMDocument;
use DOMElement;
@@ -21,6 +22,8 @@
final class HtmlTransformer
{
+ use DomHelpersTrait;
+
private const MAX_INTERACTION_CANDIDATES = 100;
/**
@@ -808,11 +811,6 @@ private function collectBlockNavigationItems(array $blocks, array &$items): void
}
}
- private function normalizedNavigationLabel(string $label): string
- {
- return trim(preg_replace('/\s+/', ' ', html_entity_decode($this->runtime->stripAllTags($label), ENT_QUOTES | ENT_HTML5, 'UTF-8')) ?? $label);
- }
-
private function sourceNavigationAnchorLabel(DOMElement $anchor): string
{
$label = $this->normalizedNavigationLabel($anchor->textContent ?? '');
@@ -2224,36 +2222,6 @@ private function sourceConversionMetadata(string $blockName, DOMElement $element
);
}
- private function innerHtml(DOMElement $element): string
- {
- $html = '';
- foreach ( $element->childNodes as $child ) {
- $html .= $element->ownerDocument->saveHTML($child);
- }
-
- return trim($html);
- }
-
- private function innerHtmlPreservingWhitespace(DOMElement $element): string
- {
- $html = '';
- foreach ( $element->childNodes as $child ) {
- $html .= $element->ownerDocument->saveHTML($child);
- }
-
- return $html;
- }
-
- private function outerHtml(DOMElement $element): string
- {
- return trim($element->ownerDocument->saveHTML($element) ?: '');
- }
-
- private function attr(DOMElement $element, string $name): string
- {
- return $element->hasAttribute($name) ? $element->getAttribute($name) : '';
- }
-
/**
* @return array
*/
@@ -2269,16 +2237,6 @@ private function presentationAttributes(DOMElement $element): array
), static fn ($value): bool => is_array($value) ? array() !== $value : '' !== trim((string) $value));
}
- private function safeAnchor(string $id): string
- {
- $id = trim($id);
- if ( '' === $id || ! preg_match('/^[A-Za-z][A-Za-z0-9_-]*$/', $id) ) {
- return '';
- }
-
- return $id;
- }
-
private function mergedPresentationStyle(DOMElement $element): string
{
$inlineStyle = $this->attr($element, 'style');
@@ -2949,44 +2907,6 @@ private function dynamicTextContent(DOMElement $element): ?string
return $this->attr($element, 'data-prefix') . $value . $this->attr($element, 'data-suffix');
}
- private function hasClass(DOMElement $element, string $className): bool
- {
- return in_array($className, preg_split('/\s+/', trim($this->attr($element, 'class'))) ?: array(), true);
- }
-
- private function elementSelector(DOMElement $element): string
- {
- $parts = array();
- $current = $element;
- while ( $current instanceof DOMElement && 'body' !== strtolower($current->tagName) ) {
- $tagName = strtolower($current->tagName);
- $index = 1;
- for ( $sibling = $current->previousSibling; $sibling instanceof DOMNode; $sibling = $sibling->previousSibling ) {
- if ( $sibling instanceof DOMElement && strtolower($sibling->tagName) === $tagName ) {
- ++$index;
- }
- }
- array_unshift($parts, $tagName . ':nth-of-type(' . $index . ')');
- $current = $current->parentNode instanceof DOMElement ? $current->parentNode : null;
- }
-
- return implode(' > ', $parts);
- }
-
- /**
- * @return array
- */
- private function htmlAttributes(DOMElement $element): array
- {
- $attributes = array();
- foreach ( $element->attributes ?? array() as $attribute ) {
- $attributes[$attribute->nodeName] = $attribute->nodeValue ?? '';
- }
-
- ksort($attributes);
- return $attributes;
- }
-
/**
* @return array
*/
@@ -3022,19 +2942,6 @@ private function sourceContext(DOMElement $element): array
), static fn (mixed $value): bool => '' !== $value && array() !== $value);
}
- /**
- * @return array
- */
- private function ancestorTags(DOMElement $element): array
- {
- $tags = array();
- for ( $parent = $element->parentNode; $parent instanceof DOMElement && 'body' !== strtolower($parent->tagName); $parent = $parent->parentNode ) {
- $tags[] = strtolower($parent->tagName);
- }
-
- return $tags;
- }
-
private function nearestPreviousHeadingText(DOMElement $element): string
{
for ( $node = $element->previousSibling; $node instanceof DOMNode; $node = $node->previousSibling ) {
@@ -3046,14 +2953,6 @@ private function nearestPreviousHeadingText(DOMElement $element): string
return '';
}
- /**
- * @return array
- */
- private function classNames(DOMElement $element): array
- {
- return array_values(array_filter(preg_split('/\s+/', trim($this->attr($element, 'class'))) ?: array()));
- }
-
/**
* @return array
*/
@@ -3200,18 +3099,6 @@ private function safeSourceFragment(DOMElement $element): string
return $html;
}
- private function childElementCount(DOMElement $element): int
- {
- $count = 0;
- foreach ( $element->childNodes as $child ) {
- if ( $child instanceof DOMElement ) {
- ++$count;
- }
- }
-
- return $count;
- }
-
/**
* @return array>
*/
@@ -3567,21 +3454,6 @@ private function materializationHintForInteractionKind(string $kind): string
};
}
- private function closestTagName(DOMElement $element): ?string
- {
- return $element->parentNode instanceof DOMElement ? strtolower($element->parentNode->tagName) : null;
- }
-
- private function firstChildElement(DOMElement $element, string $tagName): ?DOMElement
- {
- foreach ( $element->childNodes as $child ) {
- if ( $child instanceof DOMElement && strtolower($child->tagName) === $tagName ) {
- return $child;
- }
- }
- return null;
- }
-
private function figureMediaElement(DOMElement $figure, string $tagName): ?DOMElement
{
$direct = $this->firstChildElement($figure, $tagName);
@@ -3635,39 +3507,6 @@ private function figureLinkedMediaAnchor(DOMElement $figure): ?DOMElement
return $anchor instanceof DOMElement && $this->isImageOnlyAnchor($anchor) ? $anchor : null;
}
- private function onlyChildElement(DOMElement $element, string $tagName): ?DOMElement
- {
- $match = null;
- foreach ( $element->childNodes as $child ) {
- if ( XML_TEXT_NODE === $child->nodeType && '' === trim($child->textContent ?? '') ) {
- continue;
- }
-
- if ( ! $child instanceof DOMElement || strtolower($child->tagName) !== $tagName || null !== $match ) {
- return null;
- }
-
- $match = $child;
- }
-
- return $match;
- }
-
- /**
- * @param array $excludedTags
- */
- private function innerHtmlWithoutTags(DOMElement $element, array $excludedTags): string
- {
- $html = '';
- foreach ( $element->childNodes as $child ) {
- if ( $child instanceof DOMElement && in_array(strtolower($child->tagName), $excludedTags, true) ) {
- continue;
- }
- $html .= $element->ownerDocument->saveHTML($child);
- }
- return trim($html);
- }
-
private function citationFromElement(DOMElement $element): string
{
foreach ( $element->childNodes as $child ) {
@@ -3914,16 +3753,6 @@ private function listItems(DOMElement $list, array &$fallbacks): array
return $items;
}
- private function safeFallbackHtml(DOMElement $element): string
- {
- $html = preg_replace('@<(script|style)[^>]*?>.*?\\1>@si', '', $this->outerHtml($element)) ?? '';
- $html = preg_replace('/\s+on[a-z]+\s*=\s*("[^"]*"|\'[^\']*\'|[^\s>]+)/i', '', $html) ?? '';
- $html = preg_replace('/\s+(?:href|src|xlink:href)\s*=\s*("\s*javascript:[^"]*"|\'\s*javascript:[^\']*\'|javascript:[^\s>]+)/i', '', $html) ?? '';
- $html = preg_replace('/\s+srcdoc\s*=\s*("[^"]*"|\'[^\']*\'|[^\s>]+)/i', '', $html) ?? '';
-
- return trim($html);
- }
-
/**
* @return array|null
*/
@@ -4600,48 +4429,6 @@ private function safeCanvasAttributes(DOMElement $element): array
return $safe;
}
- /**
- * @return array{html: string, bytes: int, truncated: bool}
- */
- private function boundedFallbackHtml(string $html): array
- {
- $bytes = strlen($html);
- if ( $bytes > 2000 ) {
- return array(
- 'html' => substr($html, 0, 2000) . '...',
- 'bytes' => $bytes,
- 'truncated' => true,
- );
- }
-
- return array(
- 'html' => $html,
- 'bytes' => $bytes,
- 'truncated' => false,
- );
- }
-
- /**
- * @return array{text: string, bytes: int, truncated: bool}
- */
- private function boundedFallbackText(string $text): array
- {
- $bytes = strlen($text);
- if ( $bytes > 2000 ) {
- return array(
- 'text' => substr($text, 0, 2000) . '...',
- 'bytes' => $bytes,
- 'truncated' => true,
- );
- }
-
- return array(
- 'text' => $text,
- 'bytes' => $bytes,
- 'truncated' => false,
- );
- }
-
/**
* @return array
*/
@@ -5766,18 +5553,6 @@ private function looksLikeSplitLayout(DOMElement $element): bool
return (bool) preg_match('/(?:^|[\s_-])(?:split|two[\s_-]?col|media[\s_-]?text|text[\s_-]?media|feature[\s_-]?row|hero[\s_-]?(?:inner|grid|content|layout)|content[\s_-]?grid)(?:$|[\s_-])/', $name);
}
- private function directElementChildCount(DOMElement $element): int
- {
- $count = 0;
- foreach ( $element->childNodes as $child ) {
- if ( $child instanceof DOMElement ) {
- ++$count;
- }
- }
-
- return $count;
- }
-
private function looksLikeDocumentationLayout(DOMElement $element): bool
{
$name = strtolower(trim($this->attr($element, 'class') . ' ' . $this->attr($element, 'id')));
@@ -6565,20 +6340,6 @@ private function nonCoreImageFigureClassName(DOMElement $figure): string
return implode(' ', $classes);
}
- private function mergeClassNames(string ...$classNames): string
- {
- $classes = array();
- foreach ( $classNames as $className ) {
- foreach ( preg_split('/\s+/', trim($className)) ?: array() as $class ) {
- if ( '' !== $class && ! in_array($class, $classes, true) ) {
- $classes[] = $class;
- }
- }
- }
-
- return implode(' ', $classes);
- }
-
/**
* @return array
*/
@@ -6687,15 +6448,6 @@ private function sanitizedSyntaxHtml(DOMElement $element): string
/**
* @param array $attrs
*/
- private function htmlAttributeString(array $attrs): string
- {
- $html = '';
- foreach ( $attrs as $name => $value ) {
- $html .= ' ' . $name . '="' . htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '"';
- }
- return $html;
- }
-
/**
* @return array>
*/
diff --git a/php-transformer/src/HtmlToBlocks/Support/DomHelpersTrait.php b/php-transformer/src/HtmlToBlocks/Support/DomHelpersTrait.php
new file mode 100644
index 00000000..8fc1e149
--- /dev/null
+++ b/php-transformer/src/HtmlToBlocks/Support/DomHelpersTrait.php
@@ -0,0 +1,269 @@
+runtime->stripAllTags($label), ENT_QUOTES | ENT_HTML5, 'UTF-8')) ?? $label);
+ }
+
+    /**
+     * Serializes the children of an element to an HTML string.
+     *
+     * Every child node is rendered through the owning document's saveHTML()
+     * in document order; the concatenated markup is returned with leading
+     * and trailing whitespace trimmed.
+     */
+    private function innerHtml(DOMElement $element): string
+    {
+        $document = $element->ownerDocument;
+        $pieces   = array();
+        foreach ( $element->childNodes as $node ) {
+            $pieces[] = $document->saveHTML($node);
+        }
+
+        return trim(implode('', $pieces));
+    }
+
+    /**
+     * Serializes the children of an element to an HTML string without trimming.
+     *
+     * Same serialization as innerHtml(), but the result is returned exactly
+     * as produced, so leading and trailing whitespace is kept.
+     */
+    private function innerHtmlPreservingWhitespace(DOMElement $element): string
+    {
+        $document = $element->ownerDocument;
+        $markup   = '';
+        for ( $node = $element->firstChild; null !== $node; $node = $node->nextSibling ) {
+            $markup .= $document->saveHTML($node);
+        }
+
+        return $markup;
+    }
+
+    /**
+     * Serializes the element itself (tag, attributes and children) to HTML.
+     *
+     * A falsy result from saveHTML() is reported as an empty string;
+     * otherwise the markup is returned trimmed.
+     */
+    private function outerHtml(DOMElement $element): string
+    {
+        $serialized = $element->ownerDocument->saveHTML($element);
+        if ( ! $serialized ) {
+            return '';
+        }
+
+        return trim($serialized);
+    }
+
+    /**
+     * Reads an attribute value, yielding an empty string when the attribute
+     * is not present on the element.
+     */
+    private function attr(DOMElement $element, string $name): string
+    {
+        if ( ! $element->hasAttribute($name) ) {
+            return '';
+        }
+
+        return $element->getAttribute($name);
+    }
+
+    /**
+     * Validates a candidate anchor id.
+     *
+     * The id is trimmed and accepted only when it starts with an ASCII
+     * letter and continues with ASCII letters, digits, underscores or
+     * hyphens. Anything else (including an empty string) yields ''.
+     */
+    private function safeAnchor(string $id): string
+    {
+        $candidate = trim($id);
+        $isValid   = 1 === preg_match('/^[A-Za-z][A-Za-z0-9_-]*$/', $candidate);
+
+        return $isValid ? $candidate : '';
+    }
+
+    /**
+     * Tells whether the element's class attribute contains the given token.
+     *
+     * The attribute is trimmed and split on whitespace runs; the token is
+     * matched with a strict, case-sensitive comparison.
+     */
+    private function hasClass(DOMElement $element, string $className): bool
+    {
+        $classAttribute = trim($this->attr($element, 'class'));
+        $tokens         = preg_split('/\s+/', $classAttribute);
+        if ( ! $tokens ) {
+            $tokens = array();
+        }
+
+        return in_array($className, $tokens, true);
+    }
+
+    /**
+     * Builds a structural selector path for the element.
+     *
+     * Walks from the element up through its element ancestors, stopping
+     * before a <body> element (or when the parent is not an element). Each
+     * step is rendered as `tag:nth-of-type(n)`, where n is one plus the
+     * number of preceding siblings with the same lowercase tag name. Steps
+     * are joined outermost-first with ' > '.
+     */
+    private function elementSelector(DOMElement $element): string
+    {
+        $segments = array();
+        for ( $node = $element; $node instanceof DOMElement; $node = $node->parentNode ) {
+            $tag = strtolower($node->tagName);
+            if ( 'body' === $tag ) {
+                break;
+            }
+
+            // Count same-tag siblings that come before this node.
+            $position = 1;
+            $previous = $node->previousSibling;
+            while ( $previous instanceof DOMNode ) {
+                if ( $previous instanceof DOMElement && strtolower($previous->tagName) === $tag ) {
+                    ++$position;
+                }
+                $previous = $previous->previousSibling;
+            }
+
+            $segments[] = $tag . ':nth-of-type(' . $position . ')';
+        }
+
+        // Segments were collected innermost-first; the path reads outermost-first.
+        return implode(' > ', array_reverse($segments));
+    }
+
+    /**
+     * Collects the element's attributes into a name => value map sorted by
+     * attribute name. A null attribute value is stored as an empty string.
+     *
+     * @return array<string, string>
+     */
+    private function htmlAttributes(DOMElement $element): array
+    {
+        $map   = array();
+        $nodes = $element->attributes ?? array();
+        foreach ( $nodes as $node ) {
+            $value                = $node->nodeValue;
+            $map[$node->nodeName] = null === $value ? '' : $value;
+        }
+        ksort($map);
+
+        return $map;
+    }
+
+    /**
+     * Lists the lowercase tag names of the element's ancestors, nearest
+     * first. The walk stops before a <body> element or at the first parent
+     * that is not an element.
+     *
+     * @return array<int, string>
+     */
+    private function ancestorTags(DOMElement $element): array
+    {
+        $names    = array();
+        $ancestor = $element->parentNode;
+        while ( $ancestor instanceof DOMElement ) {
+            $name = strtolower($ancestor->tagName);
+            if ( 'body' === $name ) {
+                break;
+            }
+            $names[]  = $name;
+            $ancestor = $ancestor->parentNode;
+        }
+
+        return $names;
+    }
+
+    /**
+     * Splits the element's class attribute into its individual tokens.
+     *
+     * The attribute is trimmed and split on whitespace runs. Only empty
+     * strings are discarded: array_filter() without a callback would also
+     * drop the token "0", which is a legal class name.
+     *
+     * @return array<int, string> Class tokens in source order, re-indexed from 0.
+     */
+    private function classNames(DOMElement $element): array
+    {
+        $tokens = preg_split('/\s+/', trim($this->attr($element, 'class'))) ?: array();
+
+        return array_values(array_filter($tokens, static fn (string $token): bool => '' !== $token));
+    }
+
+    /**
+     * Counts the direct children of the element that are themselves
+     * elements; text, comment and other node types are not counted.
+     */
+    private function childElementCount(DOMElement $element): int
+    {
+        $total = 0;
+        for ( $node = $element->firstChild; null !== $node; $node = $node->nextSibling ) {
+            $total += $node instanceof DOMElement ? 1 : 0;
+        }
+
+        return $total;
+    }
+
+    /**
+     * Returns the lowercase tag name of the element's immediate parent, or
+     * null when the parent is not an element. Only the direct parent is
+     * inspected; no further ancestors are searched.
+     */
+    private function closestTagName(DOMElement $element): ?string
+    {
+        $parent = $element->parentNode;
+        if ( ! $parent instanceof DOMElement ) {
+            return null;
+        }
+
+        return strtolower($parent->tagName);
+    }
+
+    /**
+     * Finds the first direct child element with the given tag name.
+     *
+     * The child's tag name is lowercased before comparison, while $tagName
+     * is compared as given, so callers need to pass it in lowercase.
+     *
+     * @return DOMElement|null The first matching child, or null when none matches.
+     */
+    private function firstChildElement(DOMElement $element, string $tagName): ?DOMElement
+    {
+        for ( $node = $element->firstChild; null !== $node; $node = $node->nextSibling ) {
+            if ( ! $node instanceof DOMElement ) {
+                continue;
+            }
+            if ( strtolower($node->tagName) === $tagName ) {
+                return $node;
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Returns the element's sole meaningful child when it is an element with
+     * the given tag name.
+     *
+     * Whitespace-only text nodes are ignored. Any other child that is not a
+     * matching element, or a second matching element, makes the result null.
+     * The child's tag name is lowercased before comparison; $tagName is
+     * compared as given.
+     *
+     * @return DOMElement|null The single matching child, or null.
+     */
+    private function onlyChildElement(DOMElement $element, string $tagName): ?DOMElement
+    {
+        $found = null;
+        foreach ( $element->childNodes as $node ) {
+            $isBlankText = XML_TEXT_NODE === $node->nodeType && '' === trim($node->textContent ?? '');
+            if ( $isBlankText ) {
+                continue;
+            }
+
+            $isWanted = $node instanceof DOMElement && strtolower($node->tagName) === $tagName;
+            if ( $isWanted && null === $found ) {
+                $found = $node;
+                continue;
+            }
+
+            // Either a foreign node or a second match: there is no "only" child.
+            return null;
+        }
+
+        return $found;
+    }
+
+    /**
+     * Serializes the element's children, leaving out direct child elements
+     * whose lowercase tag name is listed in $excludedTags. Only direct
+     * children are filtered; nested descendants are serialized as part of
+     * their parent. The result is trimmed.
+     *
+     * @param array<int, string> $excludedTags Lowercase tag names to omit.
+     */
+    private function innerHtmlWithoutTags(DOMElement $element, array $excludedTags): string
+    {
+        $document = $element->ownerDocument;
+        $kept     = array();
+        foreach ( $element->childNodes as $node ) {
+            $isExcluded = $node instanceof DOMElement && in_array(strtolower($node->tagName), $excludedTags, true);
+            if ( ! $isExcluded ) {
+                $kept[] = $document->saveHTML($node);
+            }
+        }
+
+        return trim(implode('', $kept));
+    }
+
+    /**
+     * Produces a scrubbed copy of the element's outer HTML for fallback use.
+     *
+     * Applied in order to the serialized markup:
+     *  1. <script> and <style> elements are removed together with their content.
+     *  2. on* event-handler attributes are removed.
+     *  3. href / src / xlink:href attributes whose value starts with
+     *     "javascript:" are removed.
+     *  4. srcdoc attributes are removed.
+     *
+     * If any replacement fails (preg_replace() returns null) the working
+     * string becomes empty, so a regex error never leaks unfiltered markup.
+     *
+     * NOTE(review): this is a pattern-based filter, not an HTML sanitizer;
+     * confirm callers do not treat the output as trusted markup.
+     */
+    private function safeFallbackHtml(DOMElement $element): string
+    {
+        $patterns = array(
+            // Anchor on a real closing tag ("</script>" / "</style>"): matching a bare
+            // "script>" would stop early at text such as "noscript>" inside the body.
+            '@<(script|style)[^>]*?>.*?</\\1>@si',
+            '/\s+on[a-z]+\s*=\s*("[^"]*"|\'[^\']*\'|[^\s>]+)/i',
+            '/\s+(?:href|src|xlink:href)\s*=\s*("\s*javascript:[^"]*"|\'\s*javascript:[^\']*\'|javascript:[^\s>]+)/i',
+            '/\s+srcdoc\s*=\s*("[^"]*"|\'[^\']*\'|[^\s>]+)/i',
+        );
+
+        $html = $this->outerHtml($element);
+        foreach ( $patterns as $pattern ) {
+            $html = preg_replace($pattern, '', $html) ?? '';
+        }
+
+        return trim($html);
+    }
+
+    /**
+     * Caps fallback HTML at 2000 bytes.
+     *
+     * Input of at most 2000 bytes is returned unchanged. Longer input is cut
+     * and suffixed with '...'. The cut is moved back by up to three bytes
+     * when it would land inside a multibyte UTF-8 character, so the result
+     * never ends in a partial character (a plain byte cut could yield
+     * invalid UTF-8).
+     *
+     * @return array{html: string, bytes: int, truncated: bool} 'bytes' is the
+     *         byte length of the original input, not of the returned html.
+     */
+    private function boundedFallbackHtml(string $html): array
+    {
+        $limit = 2000;
+        $bytes = strlen($html);
+        if ( $bytes <= $limit ) {
+            return array(
+                'html'      => $html,
+                'bytes'     => $bytes,
+                'truncated' => false,
+            );
+        }
+
+        // A UTF-8 continuation byte (10xxxxxx) at the cut position means the
+        // previous character would be split; a character has at most three
+        // continuation bytes, so three steps back always suffice.
+        $cut = $limit;
+        for ( $step = 0; $step < 3 && 0x80 === ( ord($html[$cut]) & 0xC0 ); ++$step ) {
+            --$cut;
+        }
+
+        return array(
+            'html'      => substr($html, 0, $cut) . '...',
+            'bytes'     => $bytes,
+            'truncated' => true,
+        );
+    }
+
+    /**
+     * Caps fallback text at 2000 bytes.
+     *
+     * Input of at most 2000 bytes is returned unchanged. Longer input is cut
+     * and suffixed with '...'. The cut is moved back by up to three bytes
+     * when it would land inside a multibyte UTF-8 character, so the result
+     * never ends in a partial character (a plain byte cut could yield
+     * invalid UTF-8).
+     *
+     * @return array{text: string, bytes: int, truncated: bool} 'bytes' is the
+     *         byte length of the original input, not of the returned text.
+     */
+    private function boundedFallbackText(string $text): array
+    {
+        $limit = 2000;
+        $bytes = strlen($text);
+        if ( $bytes <= $limit ) {
+            return array(
+                'text'      => $text,
+                'bytes'     => $bytes,
+                'truncated' => false,
+            );
+        }
+
+        // A UTF-8 continuation byte (10xxxxxx) at the cut position means the
+        // previous character would be split; a character has at most three
+        // continuation bytes, so three steps back always suffice.
+        $cut = $limit;
+        for ( $step = 0; $step < 3 && 0x80 === ( ord($text[$cut]) & 0xC0 ); ++$step ) {
+            --$cut;
+        }
+
+        return array(
+            'text'      => substr($text, 0, $cut) . '...',
+            'bytes'     => $bytes,
+            'truncated' => true,
+        );
+    }
+
+    /**
+     * Counts the direct children of the element that are themselves elements.
+     *
+     * Kept as a separate name for existing callers; the counting itself is
+     * delegated to childElementCount() so the two cannot drift apart.
+     */
+    private function directElementChildCount(DOMElement $element): int
+    {
+        return $this->childElementCount($element);
+    }
+
+    /**
+     * Merges any number of class-attribute strings into one.
+     *
+     * Each argument is trimmed and split on whitespace runs. Empty tokens
+     * are dropped and duplicates are removed, keeping the first occurrence,
+     * so the result lists every distinct class once, in order of first
+     * appearance, separated by single spaces.
+     */
+    private function mergeClassNames(string ...$classNames): string
+    {
+        $tokens = array();
+        foreach ( $classNames as $list ) {
+            $parts = preg_split('/\s+/', trim($list)) ?: array();
+            foreach ( $parts as $part ) {
+                if ( '' === $part ) {
+                    continue;
+                }
+                $tokens[] = $part;
+            }
+        }
+
+        return implode(' ', array_unique($tokens));
+    }
+
+    /**
+     * Renders an attribute map as an HTML attribute string.
+     *
+     * Each pair becomes ` name="value"` (with a leading space); values are
+     * escaped with htmlspecialchars() using ENT_QUOTES | ENT_SUBSTITUTE.
+     * Attribute names are emitted exactly as given.
+     * NOTE(review): names are not validated or escaped here — presumably
+     * callers pass only vetted names; confirm at the call sites.
+     *
+     * @param array<string, string> $attrs Attribute name => raw value.
+     */
+    private function htmlAttributeString(array $attrs): string
+    {
+        $pairs = array();
+        foreach ( $attrs as $name => $value ) {
+            $escaped = htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
+            $pairs[] = ' ' . $name . '="' . $escaped . '"';
+        }
+
+        return implode('', $pairs);
+    }
+}