Tim Starling has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/335367 )
Change subject: Improve DOMBuilder and add tests ...................................................................... Improve DOMBuilder and add tests * Move DOM-related classes to a separate namespace * Introduce DOMSerializer which is a TreeHandler interchangeable with Serializer, allowing comparative testing * Make the Formatter implementations also be able to serialize DOM subtrees. This allows DOM serialization to share the non-static member data such as void element lists with the tree mutation stream serializer. * Run the html5lib tests on the DOM serializer. This requires blacklisting a few tests due to validation done in libxml2. Change-Id: I4fc50a155f1f94c9f6bfdd888d0d2aebfc43637d --- M bin/test.php A src/DOM/DOMBuilder.php A src/DOM/DOMFormatter.php A src/DOM/DOMSerializer.php A src/Serializer/AbstractSerializer.php M src/Serializer/HtmlFormatter.php M src/Serializer/Serializer.php M src/Serializer/TestFormatter.php D src/TreeBuilder/DOMBuilder.php M tests/phpunit/TreeBuilderTest.php 10 files changed, 664 insertions(+), 179 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/libs/RemexHtml refs/changes/67/335367/1 diff --git a/bin/test.php b/bin/test.php index 3708a35..f0dfbea 100755 --- a/bin/test.php +++ b/bin/test.php @@ -7,6 +7,7 @@ require __DIR__ . '/../vendor/autoload.php'; +use RemexHtml\DOM; use RemexHtml\Tokenizer; use RemexHtml\TreeBuilder; use RemexHtml\Serializer; @@ -41,7 +42,7 @@ function reserialize( $text ) { $handler = new Tokenizer\TokenSerializer; $tokenizer = new Tokenizer\Tokenizer( $handler, $text, [] ); - $tokenizer->execute(); + $tokenizer->execute( $GLOBALS['executeOptions'] ); print $handler->getOutput() . 
"\n"; foreach ( $handler->getErrors() as $error ) { print "Error at {$error[1]}: {$error[0]}\n"; @@ -77,10 +78,7 @@ $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder ); $dispatchTracer = new TreeBuilder\DispatchTracer( $text, $dispatcher, $traceCallback ); $tokenizer = new Tokenizer\Tokenizer( $dispatchTracer, $text, [] ); - $tokenizer->execute( [ - // 'fragmentNamespace' => \RemexHtml\HTMLData::NS_HTML, - // 'fragmentName' => 'html' - ] ); + $tokenizer->execute( $GLOBALS['executeOptions'] ); print $serializer->getResult() . "\n"; } @@ -95,10 +93,7 @@ $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder ); $dispatchTracer = new TreeBuilder\DispatchTracer( $text, $dispatcher, $traceCallback ); $tokenizer = new Tokenizer\Tokenizer( $dispatchTracer, $text, [] ); - $tokenizer->execute( [ - // 'fragmentNamespace' => \RemexHtml\HTMLData::NS_HTML, - // 'fragmentName' => 'html' - ] ); + $tokenizer->execute( $GLOBALS['executeOptions'] ); } function tidy( $text ) { @@ -110,7 +105,48 @@ $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] ); $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder ); $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, $GLOBALS['tokenizerOptions'] ); - $tokenizer->execute(); + $tokenizer->execute( $GLOBALS['executeOptions'] ); + print $serializer->getResult() . "\n"; +} + +function test( $text ) { + $error = function ( $msg, $pos ) { + print " * [$pos] $msg\n"; + }; + $formatter = new Serializer\TestFormatter; + $serializer = new Serializer\Serializer( $formatter, $error ); + $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] ); + $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder ); + $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, $GLOBALS['tokenizerOptions'] ); + $tokenizer->execute( $GLOBALS['executeOptions'] ); + print $serializer->getResult() . 
"\n"; +} + +function tidyViaDOM( $text ) { + $error = function ( $msg, $pos ) { + print " * [$pos] $msg\n"; + }; + $formatter = new Serializer\HtmlFormatter; + $domBuilder = new DOM\DOMBuilder( $error ); + $serializer = new DOM\DOMSerializer( $domBuilder, $formatter ); + $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] ); + $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder ); + $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, [] ); + $tokenizer->execute( $GLOBALS['executeOptions'] ); + print $serializer->getResult() . "\n"; +} + +function testViaDOM( $text ) { + $error = function ( $msg, $pos ) { + print " * [$pos] $msg\n"; + }; + $formatter = new Serializer\TestFormatter; + $domBuilder = new DOM\DOMBuilder( $error ); + $serializer = new DOM\DOMSerializer( $domBuilder, $formatter ); + $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] ); + $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder ); + $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, [] ); + $tokenizer->execute( $GLOBALS['executeOptions'] ); print $serializer->getResult() . 
"\n"; } @@ -118,10 +154,7 @@ $time = -microtime( true ); $handler = new NullHandler; $tokenizer = new Tokenizer\Tokenizer( $handler, $text, $GLOBALS['tokenizerOptions'] ); - $tokenizer->execute( [ - //'state' => Tokenizer\Tokenizer::STATE_SCRIPT_DATA, - //'appropriateEndTag' => 'script' - ] ); + $tokenizer->execute( $GLOBALS['executeOptions'] ); $time += microtime( true ); print "$time\n"; } @@ -130,7 +163,7 @@ $time = -microtime( true ); $handler = new Tokenizer\TokenSerializer; $tokenizer = new Tokenizer\Tokenizer( $handler, $text, $GLOBALS['tokenizerOptions'] ); - $tokenizer->execute(); + $tokenizer->execute( $GLOBALS['executeOptions'] ); $time += microtime( true ); print "$time\n"; } @@ -141,18 +174,18 @@ $treeBuilder = new TreeBuilder\TreeBuilder( $handler, [] ); $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder ); $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, $GLOBALS['tokenizerOptions'] ); - $tokenizer->execute(); + $tokenizer->execute( $GLOBALS['executeOptions'] ); $time += microtime( true ); print "$time\n"; } function benchmarkDOM( $text ) { $time = -microtime( true ); - $domBuilder = new TreeBuilder\DOMBuilder; + $domBuilder = new DOM\DOMBuilder; $treeBuilder = new TreeBuilder\TreeBuilder( $domBuilder, [ 'ignoreErrors' => true ] ); $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder ); $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, $GLOBALS['tokenizerOptions'] ); - $tokenizer->execute(); + $tokenizer->execute( $GLOBALS['executeOptions'] ); $time += microtime( true ); print "$time\n"; } @@ -166,7 +199,7 @@ $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] ); $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder ); $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, $GLOBALS['tokenizerOptions'] ); - $tokenizer->execute(); + $tokenizer->execute( $GLOBALS['executeOptions'] ); } $time += microtime( true ); print ( $time / $n ) . 
"\n"; @@ -181,7 +214,7 @@ $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] ); $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder ); $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, [] ); - $tokenizer->execute(); + $tokenizer->execute( $GLOBALS['executeOptions'] ); } $time += microtime( true ); print ( $time / $n ) . "\n"; @@ -214,6 +247,10 @@ 'ignoreErrors' => true, 'skipPreprocess' => true, ]; +$executeOptions = [ + // 'fragmentNamespace' => \RemexHtml\HTMLData::NS_HTML, + // 'fragmentName' => 'div' +]; $text = file_get_contents( '/tmp/Australia.html' ); while ( ( $__line = readline( "> " ) ) !== false ) { diff --git a/src/DOM/DOMBuilder.php b/src/DOM/DOMBuilder.php new file mode 100644 index 0000000..5519e17 --- /dev/null +++ b/src/DOM/DOMBuilder.php @@ -0,0 +1,195 @@ +<?php + +namespace RemexHtml\DOM; +use RemexHtml\Tokenizer\Attributes; +use RemexHtml\TreeBuilder\Element; +use RemexHtml\TreeBuilder\TreeBuilder; +use RemexHtml\TreeBuilder\TreeHandler; + +/** + * A TreeHandler which constructs a DOMDocument + */ +class DOMBuilder implements TreeHandler { + private $doc; + private $errorCallback; + private $isFragment; + + public $doctypeName; + public $public; + public $system; + public $quirks; + + /** + * @param callable|null $errorCallback A function which is called on parse errors + */ + public function __construct( $errorCallback = null ) { + $this->errorCallback = $errorCallback; + } + + /** + * Get the constructed document or document fragment. In the fragment case, + * a DOMElement is returned, and the caller is expected to extract its + * inner contents, ignoring the wrapping element. This convention is + * convenient because the wrapping element gives libxml somewhere to put + * its namespace declarations. If we copied the children into a + * DOMDocumentFragment, libxml would invent new prefixes for the orphaned + * namespaces. 
+ * + * @return DOMNode + */ + public function getFragment() { + if ( $this->isFragment ) { + return $this->doc->documentElement; + } else { + return $this->doc; + } + } + + public function startDocument( $fragmentNamespace, $fragmentName ) { + $impl = new \DOMImplementation; + $this->isFragment = $fragmentNamespace !== null; + $this->doc = $this->createDocument(); + } + + private function createDocument( $doctypeName = null, $public = null, $system = null ) { + $impl = new \DOMImplementation; + if ( $doctypeName === null + || $doctypeName === '' // libxml limitation, causes test failures + ) { + $doc = $impl->createDocument( null, null ); + } else { + $doctype = $impl->createDocumentType( $doctypeName, $public, $system ); + $doc = $impl->createDocument( null, null, $doctype ); + } + $doc->encoding = 'UTF-8'; + return $doc; + } + + public function endDocument( $pos ) { + } + + private function insertNode( $preposition, $refElement, $node ) { + if ( $preposition === TreeBuilder::ROOT ) { + $parent = $this->doc; + $refNode = null; + } elseif ( $preposition === TreeBuilder::BEFORE ) { + $parent = $refElement->userData->parentNode; + $refNode = $refElement->userData; + } else { + $parent = $refElement->userData; + $refNode = null; + } + $parent->insertBefore( $node, $refNode ); + } + + private function createNode( Element $element ) { + $node = $this->doc->createElementNS( + $element->namespace, + $element->name ); + + foreach ( $element->attrs->getObjects() as $attr ) { + if ( $attr->namespaceURI === null + && strpos( $attr->localName, ':' ) !== false + ) { + // FIXME: this apparently works to create a prefixed localName + // in the null namespace, but this is probably taking advantage + // of a bug in PHP's DOM library, and screws up in various + // interesting ways. For example, attributes created in this + // way can't be discovered via hasAttribute() or hasAttributeNS(). 
+ $attrNode = $this->doc->createAttribute( $attr->localName ); + $attrNode->value = $attr->value; + $node->setAttributeNodeNS( $attrNode ); + } else { + $node->setAttributeNS( + $attr->namespaceURI, + $attr->qualifiedName, + $attr->value ); + } + } + $element->userData = $node; + return $node; + } + + public function characters( $preposition, $refElement, $text, $start, $length, + $sourceStart, $sourceLength + ) { + $node = $this->doc->createTextNode( substr( $text, $start, $length ) ); + $this->insertNode( $preposition, $refElement, $node ); + } + + public function insertElement( $preposition, $refElement, Element $element, $void, + $sourceStart, $sourceLength + ) { + if ( $element->userData ) { + $node = $element->userData; + } else { + $node = $this->createNode( $element ); + } + $this->insertNode( $preposition, $refElement, $node ); + } + + public function endTag( Element $element, $sourceStart, $sourceLength ) { + } + + public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) { + if ( !$this->doc->firstChild ) { + $impl = $this->doc->implementation; + $this->doc = $this->createDocument( $name, $public, $system ); + } + $this->doctypeName = $name; + $this->public = $public; + $this->system = $system; + $this->quirks = $quirks; + } + + public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) { + $node = $this->doc->createComment( $text ); + $this->insertNode( $preposition, $refElement, $node ); + } + + public function error( $text, $pos ) { + if ( $this->errorCallback ) { + call_user_func( $this->errorCallback, $text, $pos ); + } + } + + public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) { + $node = $element->userData; + foreach ( $attrs->getObjects() as $name => $attr ) { + if ( $attr->namespaceURI === null + && strpos( $attr->localName, ':' ) !== false + ) { + // As noted in createNode(), we can't use hasAttribute() here. 
+ // However, we can use the return value of setAttributeNodeNS() + // instead. + $attrNode = $this->doc->createAttribute( $attr->localName ); + $attrNode->value = $attr->value; + $replaced = $node->setAttributeNodeNS( $attrNode ); + if ( $replaced ) { + // Put it back how it was + $node->setAttributeNodeNS( $replaced ); + } + } elseif ( $attr->namespaceURI === null ) { + if ( !$node->hasAttribute( $attr->localName ) ) { + $node->setAttribute( $attr->localName, $attr->value ); + } + } elseif ( !$node->hasAttributeNS( $attr->namespaceURI, $attr->localName ) ) { + $node->setAttributeNS( $attr->namespaceURI, $attr->localName, $attr->value ); + } + } + } + + public function removeNode( Element $element, $sourceStart ) { + $node = $element->userData; + $node->parentNode->removeChild( $node ); + } + + public function reparentChildren( Element $element, Element $newParent, $sourceStart ) { + $this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 ); + $node = $element->userData; + $newParentNode = $newParent->userData; + while ( $node->firstChild !== $newParentNode ) { + $newParentNode->appendChild( $node->firstChild ); + } + } +} diff --git a/src/DOM/DOMFormatter.php b/src/DOM/DOMFormatter.php new file mode 100644 index 0000000..dad8a7d --- /dev/null +++ b/src/DOM/DOMFormatter.php @@ -0,0 +1,21 @@ +<?php + +namespace RemexHtml\DOM; + +interface DOMFormatter { + /** + * Recursively format a DOMNode. + * + * @param DOMNode $node The node to format + * @return string + */ + function formatDOMNode( \DOMNode $node ); + + /** + * Non-recursively format a DOMElement. 
+ * + * @param DOMElement $element The element to format + * @param string $contents The formatted contents of the element + */ + function formatDOMElement( \DOMElement $element, $contents ); +} diff --git a/src/DOM/DOMSerializer.php b/src/DOM/DOMSerializer.php new file mode 100644 index 0000000..8f60265 --- /dev/null +++ b/src/DOM/DOMSerializer.php @@ -0,0 +1,86 @@ +<?php + +namespace RemexHtml\DOM; +use RemexHtml\Serializer\AbstractSerializer; +use RemexHtml\Tokenizer\Attributes; +use RemexHtml\TreeBuilder\Element; + +/** + * This class provides a Serializer-like interface to DOMBuilder, allowing + * DOMBuilder and direct serialization to be used interchangeably. + * + * HtmlFormatter::formatDOMNode() can be used directly if this interface is + * not required. + */ +class DOMSerializer implements AbstractSerializer { + private $formatter; + + /** + * Constructor + * + * @param DOMFormatter $formatter This may be, for example, an HtmlFormatter object + */ + public function __construct( DOMBuilder $builder, DOMFormatter $formatter ) { + $this->builder = $builder; + $this->formatter = $formatter; + } + + public function getResult() { + $fragment = $this->builder->getFragment(); + $s = ''; + foreach ( $fragment->childNodes as $child ) { + $s .= $this->formatter->formatDOMNode( $child ); + } + return $s; + } + + public function startDocument( $fragmentNamespace, $fragmentName ) { + $this->builder->startDocument( $fragmentNamespace, $fragmentName ); + } + + public function endDocument( $pos ) { + $this->builder->endDocument( $pos ); + } + + public function characters( $preposition, $refElement, $text, $start, $length, + $sourceStart, $sourceLength + ) { + $this->builder->characters( $preposition, $refElement, $text, $start, $length, + $sourceStart, $sourceLength ); + } + + public function insertElement( $preposition, $refElement, Element $element, $void, + $sourceStart, $sourceLength + ) { + $this->builder->insertElement( $preposition, $refElement, $element, $void, + 
$sourceStart, $sourceLength ); + } + + public function endTag( Element $element, $sourceStart, $sourceLength ) { + $this->builder->endTag( $element, $sourceStart, $sourceLength ); + } + + public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) { + $this->builder->doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ); + } + + public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) { + $this->builder->comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ); + } + + public function error( $text, $pos ) { + $this->builder->error( $text, $pos ); + } + + public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) { + $this->builder->mergeAttributes( $element, $attrs, $sourceStart ); + } + + public function removeNode( Element $element, $sourceStart ) { + $this->builder->removeNode( $element, $sourceStart ); + } + + public function reparentChildren( Element $element, Element $newParent, $sourceStart ) { + $this->builder->reparentChildren( $element, $newParent, $sourceStart ); + } +} diff --git a/src/Serializer/AbstractSerializer.php b/src/Serializer/AbstractSerializer.php new file mode 100644 index 0000000..0243a7d --- /dev/null +++ b/src/Serializer/AbstractSerializer.php @@ -0,0 +1,12 @@ +<?php +namespace RemexHtml\Serializer; +use RemexHtml\TreeBuilder\TreeHandler; + +interface AbstractSerializer extends TreeHandler { + /** + * Get the serialized result of tree construction + * + * @return string + */ + function getResult(); +} diff --git a/src/Serializer/HtmlFormatter.php b/src/Serializer/HtmlFormatter.php index 40e3a89..3dbf594 100644 --- a/src/Serializer/HtmlFormatter.php +++ b/src/Serializer/HtmlFormatter.php @@ -2,11 +2,12 @@ namespace RemexHtml\Serializer; use RemexHtml\HTMLData; +use RemexHtml\DOM\DOMFormatter; /** * A formatter which follows the HTML 5 fragment serialization algorithm. 
*/ -class HtmlFormatter implements Formatter { +class HtmlFormatter implements Formatter, DOMFormatter { /** * The elements for which a closing tag is omitted. */ @@ -73,28 +74,37 @@ ]; /** - * The scripting flag, which is true if scripting is enabled. This influences - * <noscript> serialization. + * Attribute namespaces which have unqualified local names */ - protected $scriptingFlag; + protected $unqualifiedNamespaces = [ + HTMLData::NS_HTML => true, + HTMLData::NS_MATHML => true, + HTMLData::NS_SVG => true, + ]; + + protected $useSourceDoctype; /** * Constructor. * * @param array $options An associative array of options: * - scriptingFlag : Set this to false to disable scripting. True by default. + * - useSourceDoctype : Emit the doctype used in the source. If this is + * false or absent, an HTML doctype will be used. */ public function __construct( $options = [] ) { $options += [ - 'scriptingFlag' => true + 'scriptingFlag' => true, + 'useSourceDoctype' => false, ]; if ( $options['scriptingFlag'] ) { $this->rawTextElements['noscript'] = true; } + $this->useSourceDoctype = $options['useSourceDoctype']; } public function startDocument( $fragmentNamespace, $fragmentName ) { - return "<!DOCTYPE html>\n"; + return "<!DOCTYPE html>"; } public function characters( SerializerNode $parent, $text, $start, $length ) { @@ -136,4 +146,114 @@ public function doctype( $name, $public, $system ) { return ''; } + + public function formatDOMNode( \DOMNode $node ) { + $contents = ''; + if ( $node->firstChild ) { + foreach ( $node->childNodes as $child ) { + $contents .= $this->formatDOMNode( $child ); + } + } + + switch ( $node->nodeType ) { + case XML_ELEMENT_NODE: + return $this->formatDOMElement( $node, $contents ); + + case XML_DOCUMENT_NODE: + if ( !$this->useSourceDoctype ) { + return "<!DOCTYPE html>" . 
$contents; + } else { + return $contents; + } + + case XML_DOCUMENT_FRAG_NODE: + return $contents; + + case XML_TEXT_NODE: + $text = $node->data; + $parent = $node->parentNode; + if ( $parent->namespaceURI !== HTMLData::NS_HTML + || !isset( $this->rawTextElements[$parent->nodeName] ) + ) { + $text = strtr( $text, $this->textEscapes ); + } + return $text; + + case XML_CDATA_SECTION_NODE: + $parent = $node->parentNode; + if ( $parent->namespaceURI === HTMLData::NS_HTML ) { + // CDATA is not allowed in HTML nodes + return $node->data; + } else { + return "<![CDATA[{$node->data}]]>"; + } + + case XML_PI_NODE: + return "<?{$node->target} {$node->data}>"; + + case XML_COMMENT_NODE: + return "<{$node->data}>"; + + case XML_DOCUMENT_TYPE_NODE: + if ( $this->useSourceDoctype ) { + return "<!DOCTYPE {$node->name}>"; + } else { + return ''; + } + + default: + return ''; + } + } + + public function formatDOMElement( \DOMElement $node, $contents ) { + $ns = $node->namespaceURI; + if ( $ns === null + || isset( $this->unqualifiedNamespaces[$ns] ) + || $node->prefix === null + ) { + $name = $node->localName; + } else { + $name = $node->prefix . ':' . $node->localName; + } + $s = '<' . $name; + foreach ( $node->attributes as $attr ) { + switch ( $attr->namespaceURI ) { + case HTMLData::NS_XML: + $attrName = 'xml:' . $attr->localName; + break; + case HTMLData::NS_XMLNS: + if ( $attr->localName === 'xmlns' ) { + $attrName = 'xmlns'; + } else { + $attrName = 'xmlns:' . $attr->localName; + } + break; + case HTMLData::NS_XLINK: + $attrName = 'xlink:' . $attr->localName; + break; + default: + if ( strlen( $attr->prefix ) ) { + $attrName = $attr->prefix . ':' . 
$attr->localName; + } else { + $attrName = $attr->localName; + } + } + $encValue = strtr( $attr->value, $this->attributeEscapes ); + $s .= " $attrName=\"$encValue\""; + } + $s .= '>'; + if ( $ns === HTMLData::NS_HTML ) { + if ( isset( $contents[0] ) && $contents[0] === "\n" + && isset( $this->prefixLfElements[$name] ) + ) { + $s .= "\n$contents</$name>"; + } elseif ( !isset( $this->voidElements[$name] ) ) { + $s .= "$contents</$name>"; + } + } else { + $s .= "$contents</$name>"; + } + return $s; + } } diff --git a/src/Serializer/Serializer.php b/src/Serializer/Serializer.php index d5150f3..a31adeb 100644 --- a/src/Serializer/Serializer.php +++ b/src/Serializer/Serializer.php @@ -12,7 +12,7 @@ * encoding elements when the end tags are seen. This is faster than building * a DOM and then serializing it, even if you use DOMDocument::saveHTML(). */ -class Serializer implements TreeHandler { +class Serializer implements AbstractSerializer { /** * A node corresponding to the Document * @var SerializerNode diff --git a/src/Serializer/TestFormatter.php b/src/Serializer/TestFormatter.php index fe33293..65af2b3 100644 --- a/src/Serializer/TestFormatter.php +++ b/src/Serializer/TestFormatter.php @@ -1,20 +1,29 @@ <?php namespace RemexHtml\Serializer; +use RemexHtml\Tokenizer\Attribute; use RemexHtml\Tokenizer\Attributes; +use RemexHtml\Tokenizer\PlainAttributes; use RemexHtml\HTMLData; +use RemexHtml\DOM\DOMFormatter; /** * A Formatter which is used to format documents in (almost) the way they * appear in the html5lib tests. A little bit of post-processing is required - * in the PHPUnit test. + * in the PHPUnit tests. 
*/ -class TestFormatter implements Formatter { - function startDocument( $fragmentNamespace, $fragmentName ) { +class TestFormatter implements Formatter, DOMFormatter { + private static $attrNamespaces = [ + HTMLData::NS_XML => 'xml', + HTMLData::NS_XLINK => 'xlink', + HTMLData::NS_XMLNS => 'xmlns', + ]; + + public function startDocument( $fragmentNamespace, $fragmentName ) { return ''; } - function doctype( $name, $public, $system ) { + public function doctype( $name, $public, $system ) { $ret = "<!DOCTYPE $name"; if ( $public !== '' || $system !== '' ) { $ret .= " \"$public\" \"$system\""; @@ -23,15 +32,22 @@ return $ret; } - function characters( SerializerNode $parent, $text, $start, $length ) { + public function characters( SerializerNode $parent, $text, $start, $length ) { + return $this->formatCharacters( substr( $text, $start, $length ) ); + } + + private function formatCharacters( $text ) { return '"' . - str_replace( "\n", "<EOL>", substr( $text, $start, $length ) ) . + str_replace( "\n", "<EOL>", $text ) . 
"\"\n"; } - function element( SerializerNode $parent, SerializerNode $node, $contents ) { - $namespace = $node->namespace; - $name = $node->name; + public function element( SerializerNode $parent, SerializerNode $node, $contents ) { + return $this->formatElement( $node->namespace, $node->name, + $node->attrs->getObjects(), $contents ); + } + + private function formatElement( $namespace, $name, $attrs, $contents ) { if ( $namespace === HTMLData::NS_HTML ) { $tagName = $name; } elseif ( $namespace === HTMLData::NS_SVG ) { @@ -42,14 +58,17 @@ $tagName = $name; } $ret = "<$tagName>\n"; - $sortedAttrs = $node->attrs->getObjects(); + $sortedAttrs = $attrs; ksort( $sortedAttrs, SORT_STRING ); foreach ( $sortedAttrs as $attrName => $attr ) { - if ( $attr->prefix !== null ) { - $ret .= " {$attr->prefix} {$attr->localName}=\"{$attr->value}\"\n"; - } else { - $ret .= " $attrName=\"{$attr->value}\"\n"; + if ( $attr->namespaceURI === null + || isset( $attr->reallyNoNamespace ) + ) { + $prefix = ''; + } elseif ( isset( self::$attrNamespaces[$attr->namespaceURI] ) ) { + $prefix = self::$attrNamespaces[$attr->namespaceURI] . 
' '; } + $ret .= " $prefix{$attr->localName}=\"{$attr->value}\"\n"; } if ( $contents !== null && $contents !== '' ) { $contents = preg_replace( '/^/m', ' ', $contents ); @@ -67,7 +86,80 @@ return $ret; } - function comment( SerializerNode $parent, $text ) { + public function comment( SerializerNode $parent, $text ) { + return $this->formatComment( $text ); + } + + private function formatComment( $text ) { return "<!-- $text -->\n"; } + + public function formatDOMNode( \DOMNode $node ) { + $contents = ''; + if ( $node->firstChild ) { + foreach ( $node->childNodes as $child ) { + $contents .= $this->formatDOMNode( $child ); + } + } + + switch ( $node->nodeType ) { + case XML_ELEMENT_NODE: + return $this->formatDOMElement( $node, $contents ); + + case XML_DOCUMENT_NODE: + case XML_DOCUMENT_FRAG_NODE: + return $contents; + + case XML_TEXT_NODE: + case XML_CDATA_SECTION_NODE: + return $this->formatCharacters( $node->data ); + + case XML_COMMENT_NODE: + return $this->formatComment( $node->data ); + + case XML_DOCUMENT_TYPE_NODE: + return $this->doctype( $node->name, $node->publicId, $node->systemId ); + + case XML_PI_NODE: + default: + return ''; + } + } + + public function formatDOMElement( \DOMElement $node, $content ) { + $attrs = []; + foreach ( $node->attributes as $attr ) { + $prefix = null; + switch ( $attr->namespaceURI ) { + case HTMLData::NS_XML: + $prefix = 'xml'; + $qName = 'xml:' . $attr->localName; + break; + case HTMLData::NS_XMLNS: + if ( $attr->localName === 'xmlns' ) { + $qName = 'xmlns'; + } else { + $prefix = 'xmlns'; + $qName = 'xmlns:' . $attr->localName; + } + break; + case HTMLData::NS_XLINK: + $prefix = 'xlink'; + $qName = 'xlink:' . $attr->localName; + break; + default: + if ( strlen( $attr->prefix ) ) { + $qName = $attr->prefix . ':' . 
$attr->localName; + } else { + $prefix = $attr->prefix; + $qName = $attr->localName; + } + } + + $attrs[$qName] = new Attribute( $qName, $attr->namespaceURI, $prefix, + $attr->localName, $attr->value ); + } + + return $this->formatElement( $node->namespaceURI, $node->nodeName, $attrs, $content ); + } } diff --git a/src/TreeBuilder/DOMBuilder.php b/src/TreeBuilder/DOMBuilder.php deleted file mode 100644 index ccf36f5..0000000 --- a/src/TreeBuilder/DOMBuilder.php +++ /dev/null @@ -1,127 +0,0 @@ -<?php - -namespace RemexHtml\TreeBuilder; -use RemexHtml\Tokenizer\Attributes; - -/** - * A TreeHandler which constructs a DOMDocument - */ -class DOMBuilder implements TreeHandler { - private $doc; - private $errorCallback; - - /** - * @param callable|null $errorCallback A function which is called on parse errors - */ - public function __construct( $errorCallback = null ) { - $this->errorCallback = $errorCallback; - } - - /** - * Get the constructed document - * @return DOMDocument - */ - public function getDocument() { - return $this->doc; - } - - public function startDocument( $fns, $fn ) { - $this->doc = new \DOMDocument; - } - - public function endDocument( $pos ) { - } - - private function insertNode( $preposition, $refElement, $node ) { - if ( $preposition === TreeBuilder::ROOT ) { - $parent = $this->doc; - $refNode = null; - } elseif ( $preposition === TreeBuilder::BEFORE ) { - $parent = $refElement->userData->parentNode; - $refNode = $refElement->userData; - } else { - $parent = $refElement->userData; - $refNode = null; - } - $parent->insertBefore( $node, $refNode ); - } - - private function createNode( Element $element ) { - $node = $this->doc->createElementNS( - $element->namespace, - $element->name ); - - foreach ( $element->getAttributeObjects() as $attr ) { - if ( $attr->namespaceURI !== null ) { - $node->setAttributeNS( - $attr->namespaceURI, - $attr->qualifiedName, - $attr->value ); - } else { - $node->setAttribute( $attr->localName, $attr->value ); - } - } - 
$element->userData = $node; - return $node; - } - - public function characters( $preposition, $refElement, $text, $start, $length, - $sourceStart, $sourceLength - ) { - $node = $this->doc->createTextNode( substr( $text, $start, $length ) ); - $this->insertNode( $preposition, $refElement, $node ); - } - - public function insertElement( $preposition, $refElement, Element $element, $void, - $sourceStart, $sourceLength - ) { - if ( $element->userData ) { - $node = $element->userData; - } else { - $node = $this->createNode( $element ); - } - $this->insertNode( $preposition, $refElement, $node ); - } - - public function endTag( Element $element, $sourceStart, $sourceLength ) { - } - - public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) { - } - - public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) { - $node = $this->doc->createComment( $text ); - $this->insertNode( $preposition, $refElement, $node ); - } - - public function error( $text, $pos ) { - if ( $this->errorCallback ) { - call_user_func( $this->errorCallback, $text, $pos ); - } - } - - public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) { - $node = $element->userData; - foreach ( $attrs->getValues() as $name => $value ) { - if ( !$node->hasAttribute( $name ) ) { - $node->setAttribute( $name, $value ); - } - } - } - - public function removeNode( Element $element, $sourceStart ) { - $node = $element->userData; - $node->parent->removeChild( $node ); - } - - public function reparentChildren( Element $element, Element $newParent, $sourceStart ) { - $this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 ); - $node = $element->userData; - $newParentNode = $newParent->userData; - foreach ( $node->childNodes as $child ) { - if ( $child !== $newParentNode ) { - $newParentNode->appendChild( $child ); - } - } - } -} diff --git a/tests/phpunit/TreeBuilderTest.php 
b/tests/phpunit/TreeBuilderTest.php index 24052b0..bb4cc1a 100644 --- a/tests/phpunit/TreeBuilderTest.php +++ b/tests/phpunit/TreeBuilderTest.php @@ -1,6 +1,7 @@ <?php namespace RemexHtml\TreeBuilder; +use RemexHtml\DOM; use RemexHtml\HTMLData; use RemexHtml\Tokenizer; use RemexHtml\Serializer; @@ -19,7 +20,36 @@ private static $testBlacklist = [ ]; - public function provider() { + private static $domTestBlacklist = [ + // Invalid tag name + 'tree-construction/html5test-com.dat:1', + 'tree-construction/webkit01.dat:179', + + // Invalid attribute name + 'tree-construction/html5test-com.dat:12', + 'tree-construction/html5test-com.dat:39', + 'tree-construction/tests14.dat:45', + 'tree-construction/tests14.dat:55', + 'tree-construction/tests14.dat:67', + 'tree-construction/tests26.dat:263', + 'tree-construction/webkit01.dat:606', + + // Invalid doctype + 'tree-construction/doctype01.dat:32', + 'tree-construction/doctype01.dat:45', + 'tree-construction/tests6.dat:48', + ]; + + public function serializerProvider() { + return $this->provider( 'serializer' ); + } + + public function domProvider() { + return $this->provider( 'dom' ); + } + + + private function provider( $type ) { $testFiles = []; foreach ( self::$testDirs as $testDir ) { $testFiles = array_merge( $testFiles, glob( __DIR__ . "/../$testDir/*.dat" ) ); @@ -29,7 +59,7 @@ if ( in_array( 'tree-construction/' . 
basename( $fileName ), self::$fileBlacklist ) ) { continue; } - $tests = $this->readFile( $fileName ); + $tests = $this->readFile( $fileName, $type ); foreach ( $tests as $test ) { if ( isset( $test['scripting'] ) ) { @@ -45,7 +75,7 @@ return $args; } - private function readFile( $fileName ) { + private function readFile( $fileName, $type ) { $text = file_get_contents( $fileName ); if ( $text === false ) { throw new \Exception( "Cannot read test file: $fileName" ); @@ -97,10 +127,17 @@ break; } } while ( !$section['end'] ); - - if ( !in_array( "$baseName:$startLine", self::$testBlacklist ) ) { - $tests[] = $test; + + if ( in_array( "$baseName:$startLine", self::$testBlacklist ) ) { + continue; } + if ( $type === 'dom' + && in_array( "$baseName:$startLine", self::$domTestBlacklist ) + ) { + continue; + } + + $tests[] = $test; } return $tests; } @@ -159,13 +196,25 @@ return $result; } - /** @dataProvider provider */ - public function testDefault( $params ) { + /** @dataProvider serializerProvider */ + public function testSerializer( $params ) { + $formatter = new Serializer\TestFormatter; + $serializer = new Serializer\Serializer( $formatter ); + $this->runWithSerializer( $serializer, $params ); + } + + /** @dataProvider domProvider */ + public function testDOMSerializer( $params ) { + $formatter = new Serializer\TestFormatter; + $builder = new DOM\DOMBuilder; + $serializer = new DOM\DOMSerializer( $builder, $formatter ); + $this->runWithSerializer( $serializer, $params ); + } + + private function runWithSerializer( Serializer\AbstractSerializer $serializer, $params ) { if ( !isset( $params['document'] ) ) { throw new \Exception( "Test lacks #document: {$params['file']}:{$params['line']}" ); } - $formatter = new Serializer\TestFormatter; - $serializer = new Serializer\Serializer( $formatter ); $treeBuilder = new TreeBuilder( $serializer, [ 'scriptingFlag' => $params['scripting'] ] ); -- To view, visit https://gerrit.wikimedia.org/r/335367 To unsubscribe, visit 
https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I4fc50a155f1f94c9f6bfdd888d0d2aebfc43637d Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/libs/RemexHtml Gerrit-Branch: master Gerrit-Owner: Tim Starling <tstarl...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits