Tim Starling has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/335367 )

Change subject: Improve DOMBuilder and add tests
......................................................................

Improve DOMBuilder and add tests

* Move DOM-related classes to a separate namespace
* Introduce DOMSerializer which is a TreeHandler interchangeable with
  Serializer, allowing comparative testing
* Make the Formatter implementations also be able to serialize DOM
  subtrees. This allows DOM serialization to share the non-static
  member data such as void element lists with the tree mutation stream
  serializer.
* Run the html5lib tests on the DOM serializer. This requires
  blacklisting a few tests due to validation done in libxml2.

Change-Id: I4fc50a155f1f94c9f6bfdd888d0d2aebfc43637d
---
M bin/test.php
A src/DOM/DOMBuilder.php
A src/DOM/DOMFormatter.php
A src/DOM/DOMSerializer.php
A src/Serializer/AbstractSerializer.php
M src/Serializer/HtmlFormatter.php
M src/Serializer/Serializer.php
M src/Serializer/TestFormatter.php
D src/TreeBuilder/DOMBuilder.php
M tests/phpunit/TreeBuilderTest.php
10 files changed, 664 insertions(+), 179 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/libs/RemexHtml 
refs/changes/67/335367/1

diff --git a/bin/test.php b/bin/test.php
index 3708a35..f0dfbea 100755
--- a/bin/test.php
+++ b/bin/test.php
@@ -7,6 +7,7 @@
 
 require __DIR__ . '/../vendor/autoload.php';
 
+use RemexHtml\DOM;
 use RemexHtml\Tokenizer;
 use RemexHtml\TreeBuilder;
 use RemexHtml\Serializer;
@@ -41,7 +42,7 @@
 function reserialize( $text ) {
        $handler = new Tokenizer\TokenSerializer;
        $tokenizer = new Tokenizer\Tokenizer( $handler, $text, [] );
-       $tokenizer->execute();
+       $tokenizer->execute( $GLOBALS['executeOptions'] );
        print $handler->getOutput() . "\n";
        foreach ( $handler->getErrors() as $error ) {
                print "Error at {$error[1]}: {$error[0]}\n";
@@ -77,10 +78,7 @@
        $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
        $dispatchTracer = new TreeBuilder\DispatchTracer( $text, $dispatcher, 
$traceCallback );
        $tokenizer = new Tokenizer\Tokenizer( $dispatchTracer, $text, [] );
-       $tokenizer->execute( [
-               // 'fragmentNamespace' => \RemexHtml\HTMLData::NS_HTML,
-               // 'fragmentName' => 'html'
-       ] );
+       $tokenizer->execute( $GLOBALS['executeOptions'] );
 
        print $serializer->getResult() . "\n";
 }
@@ -95,10 +93,7 @@
        $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
        $dispatchTracer = new TreeBuilder\DispatchTracer( $text, $dispatcher, 
$traceCallback );
        $tokenizer = new Tokenizer\Tokenizer( $dispatchTracer, $text, [] );
-       $tokenizer->execute( [
-               // 'fragmentNamespace' => \RemexHtml\HTMLData::NS_HTML,
-               // 'fragmentName' => 'html'
-       ] );
+       $tokenizer->execute( $GLOBALS['executeOptions'] );
 }
 
 function tidy( $text ) {
@@ -110,7 +105,48 @@
        $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] );
        $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
        $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, 
$GLOBALS['tokenizerOptions'] );
-       $tokenizer->execute();
+       $tokenizer->execute( $GLOBALS['executeOptions'] );
+       print $serializer->getResult() . "\n";
+}
+
+function test( $text ) {
+       $error = function ( $msg, $pos ) {
+               print "  *  [$pos] $msg\n";
+       };
+       $formatter = new Serializer\TestFormatter;
+       $serializer = new Serializer\Serializer( $formatter, $error );
+       $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] );
+       $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
+       $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, 
$GLOBALS['tokenizerOptions'] );
+       $tokenizer->execute( $GLOBALS['executeOptions'] );
+       print $serializer->getResult() . "\n";
+}
+
+function tidyViaDOM( $text ) {
+       $error = function ( $msg, $pos ) {
+               print "  *  [$pos] $msg\n";
+       };
+       $formatter = new Serializer\HtmlFormatter;
+       $domBuilder = new DOM\DOMBuilder( $error );
+       $serializer = new DOM\DOMSerializer( $domBuilder, $formatter );
+       $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] );
+       $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
+       $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, [] );
+       $tokenizer->execute( $GLOBALS['executeOptions'] );
+       print $serializer->getResult() . "\n";
+}
+
+function testViaDOM( $text ) {
+       $error = function ( $msg, $pos ) {
+               print "  *  [$pos] $msg\n";
+       };
+       $formatter = new Serializer\TestFormatter;
+       $domBuilder = new DOM\DOMBuilder( $error );
+       $serializer = new DOM\DOMSerializer( $domBuilder, $formatter );
+       $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] );
+       $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
+       $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, [] );
+       $tokenizer->execute( $GLOBALS['executeOptions'] );
        print $serializer->getResult() . "\n";
 }
 
@@ -118,10 +154,7 @@
        $time = -microtime( true );
        $handler = new NullHandler;
        $tokenizer = new Tokenizer\Tokenizer( $handler, $text, 
$GLOBALS['tokenizerOptions'] );
-       $tokenizer->execute( [
-               //'state' => Tokenizer\Tokenizer::STATE_SCRIPT_DATA,
-               //'appropriateEndTag' => 'script'
-       ] );
+       $tokenizer->execute( $GLOBALS['executeOptions'] );
        $time += microtime( true );
        print "$time\n";
 }
@@ -130,7 +163,7 @@
        $time = -microtime( true );
        $handler = new Tokenizer\TokenSerializer;
        $tokenizer = new Tokenizer\Tokenizer( $handler, $text, 
$GLOBALS['tokenizerOptions'] );
-       $tokenizer->execute();
+       $tokenizer->execute( $GLOBALS['executeOptions'] );
        $time += microtime( true );
        print "$time\n";
 }
@@ -141,18 +174,18 @@
        $treeBuilder = new TreeBuilder\TreeBuilder( $handler, [] );
        $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
        $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, 
$GLOBALS['tokenizerOptions'] );
-       $tokenizer->execute();
+       $tokenizer->execute( $GLOBALS['executeOptions'] );
        $time += microtime( true );
        print "$time\n";
 }
 
 function benchmarkDOM( $text ) {
        $time = -microtime( true );
-       $domBuilder = new TreeBuilder\DOMBuilder;
+       $domBuilder = new DOM\DOMBuilder;
        $treeBuilder = new TreeBuilder\TreeBuilder( $domBuilder, [ 
'ignoreErrors' => true ] );
        $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
        $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, 
$GLOBALS['tokenizerOptions'] );
-       $tokenizer->execute();
+       $tokenizer->execute( $GLOBALS['executeOptions'] );
        $time += microtime( true );
        print "$time\n";
 }
@@ -166,7 +199,7 @@
                $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] );
                $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
                $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, 
$GLOBALS['tokenizerOptions'] );
-               $tokenizer->execute();
+               $tokenizer->execute( $GLOBALS['executeOptions'] );
        }
        $time += microtime( true );
        print ( $time / $n ) . "\n";
@@ -181,7 +214,7 @@
                $treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] );
                $dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
                $tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, [] );
-               $tokenizer->execute();
+               $tokenizer->execute( $GLOBALS['executeOptions'] );
        }
        $time += microtime( true );
        print ( $time / $n ) . "\n";
@@ -214,6 +247,10 @@
        'ignoreErrors' => true,
        'skipPreprocess' => true,
 ];
+$executeOptions = [
+       // 'fragmentNamespace' => \RemexHtml\HTMLData::NS_HTML,
+       // 'fragmentName' => 'div'
+];
 $text = file_get_contents( '/tmp/Australia.html' );
 
 while ( ( $__line = readline( "> " ) ) !== false ) {
diff --git a/src/DOM/DOMBuilder.php b/src/DOM/DOMBuilder.php
new file mode 100644
index 0000000..5519e17
--- /dev/null
+++ b/src/DOM/DOMBuilder.php
@@ -0,0 +1,195 @@
+<?php
+
+namespace RemexHtml\DOM;
+use RemexHtml\Tokenizer\Attributes;
+use RemexHtml\TreeBuilder\Element;
+use RemexHtml\TreeBuilder\TreeBuilder;
+use RemexHtml\TreeBuilder\TreeHandler;
+
+/**
+ * A TreeHandler which constructs a DOMDocument
+ */
+class DOMBuilder implements TreeHandler {
+       /** @var \DOMDocument|null The document being built, set by startDocument() */
+       private $doc;
+
+       /** @var callable|null Receives ( $text, $pos ) for each reported error */
+       private $errorCallback;
+
+       /** @var bool|null True when startDocument() was given a fragment namespace */
+       private $isFragment;
+
+       /** @var string|null The name passed to the last doctype() call */
+       public $doctypeName;
+
+       /** @var string|null The public identifier passed to the last doctype() call */
+       public $public;
+
+       /** @var string|null The system identifier passed to the last doctype() call */
+       public $system;
+
+       /** @var mixed The quirks value passed to the last doctype() call */
+       public $quirks;
+
+       /**
+        * @param callable|null $errorCallback A function which is called on parse errors
+        */
+       public function __construct( $errorCallback = null ) {
+               $this->errorCallback = $errorCallback;
+       }
+
+       /**
+        * Get the constructed document or document fragment. In the fragment case,
+        * a DOMElement is returned, and the caller is expected to extract its
+        * inner contents, ignoring the wrapping element. This convention is
+        * convenient because the wrapping element gives libxml somewhere to put
+        * its namespace declarations. If we copied the children into a
+        * DOMDocumentFragment, libxml would invent new prefixes for the orphaned
+        * namespaces.
+        *
+        * @return \DOMNode
+        */
+       public function getFragment() {
+               if ( $this->isFragment ) {
+                       return $this->doc->documentElement;
+               } else {
+                       return $this->doc;
+               }
+       }
+
+       /**
+        * Begin a new, empty document and record whether a fragment is being parsed.
+        *
+        * @param string|null $fragmentNamespace Non-null selects fragment mode
+        * @param string|null $fragmentName Not used by this handler
+        */
+       public function startDocument( $fragmentNamespace, $fragmentName ) {
+               $this->isFragment = $fragmentNamespace !== null;
+               $this->doc = $this->createDocument();
+       }
+
+       /**
+        * Create an empty UTF-8 DOMDocument, with a doctype node if a non-empty
+        * doctype name is given.
+        *
+        * @param string|null $doctypeName
+        * @param string|null $public
+        * @param string|null $system
+        * @return \DOMDocument
+        */
+       private function createDocument( $doctypeName = null, $public = null, $system = null ) {
+               $impl = new \DOMImplementation;
+               if ( $doctypeName === null
+                       || $doctypeName === '' // libxml limitation, causes test failures
+               ) {
+                       $doc = $impl->createDocument( null, null );
+               } else {
+                       $doctype = $impl->createDocumentType( $doctypeName, $public, $system );
+                       $doc = $impl->createDocument( null, null, $doctype );
+               }
+               $doc->encoding = 'UTF-8';
+               return $doc;
+       }
+
+       /**
+        * Nothing needs to be done at the end of the document.
+        */
+       public function endDocument( $pos ) {
+       }
+
+       /**
+        * Insert a DOM node at the position described by a preposition and a
+        * reference element.
+        *
+        * @param int $preposition TreeBuilder::ROOT appends to the document,
+        *   TreeBuilder::BEFORE inserts before the reference element's node, and
+        *   any other value appends under the reference element's node
+        * @param Element|null $refElement The reference element; its DOM node is
+        *   read from userData
+        * @param \DOMNode $node The node to insert
+        */
+       private function insertNode( $preposition, $refElement, $node ) {
+               if ( $preposition === TreeBuilder::ROOT ) {
+                       $parent = $this->doc;
+                       $refNode = null;
+               } elseif ( $preposition === TreeBuilder::BEFORE ) {
+                       $parent = $refElement->userData->parentNode;
+                       $refNode = $refElement->userData;
+               } else {
+                       $parent = $refElement->userData;
+                       $refNode = null;
+               }
+               // A null reference node makes insertBefore() append
+               $parent->insertBefore( $node, $refNode );
+       }
+
+       /**
+        * Create the DOMElement for a TreeBuilder element, copy its attributes,
+        * and remember the node in $element->userData.
+        *
+        * @param Element $element
+        * @return \DOMElement
+        */
+       private function createNode( Element $element ) {
+               $node = $this->doc->createElementNS(
+                       $element->namespace,
+                       $element->name );
+
+               foreach ( $element->attrs->getObjects() as $attr ) {
+                       if ( $attr->namespaceURI === null
+                               && strpos( $attr->localName, ':' ) !== false
+                       ) {
+                               // FIXME: this apparently works to create a prefixed localName
+                               // in the null namespace, but this is probably taking advantage
+                               // of a bug in PHP's DOM library, and screws up in various
+                               // interesting ways. For example, attributes created in this
+                               // way can't be discovered via hasAttribute() or hasAttributeNS().
+                               $attrNode = $this->doc->createAttribute( $attr->localName );
+                               $attrNode->value = $attr->value;
+                               $node->setAttributeNodeNS( $attrNode );
+                       } else {
+                               $node->setAttributeNS(
+                                       $attr->namespaceURI,
+                                       $attr->qualifiedName,
+                                       $attr->value );
+                       }
+               }
+               $element->userData = $node;
+               return $node;
+       }
+
+       /**
+        * Insert the byte range [$start, $start + $length) of $text as a text node.
+        */
+       public function characters( $preposition, $refElement, $text, $start, $length,
+               $sourceStart, $sourceLength
+       ) {
+               $node = $this->doc->createTextNode( substr( $text, $start, $length ) );
+               $this->insertNode( $preposition, $refElement, $node );
+       }
+
+       /**
+        * Insert an element, creating its DOM node first unless the element
+        * already carries one in userData.
+        */
+       public function insertElement( $preposition, $refElement, Element $element, $void,
+               $sourceStart, $sourceLength
+       ) {
+               if ( $element->userData ) {
+                       $node = $element->userData;
+               } else {
+                       $node = $this->createNode( $element );
+               }
+               $this->insertNode( $preposition, $refElement, $node );
+       }
+
+       /**
+        * End tags require no action when building a DOM.
+        */
+       public function endTag( Element $element, $sourceStart, $sourceLength ) {
+       }
+
+       /**
+        * Record the doctype. The document is re-created with a doctype node only
+        * while it is still empty, so nodes which were already inserted are never
+        * discarded. The values are stored in the public properties either way.
+        */
+       public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
+               if ( !$this->doc->firstChild ) {
+                       $this->doc = $this->createDocument( $name, $public, $system );
+               }
+               $this->doctypeName = $name;
+               $this->public = $public;
+               $this->system = $system;
+               $this->quirks = $quirks;
+       }
+
+       /**
+        * Insert a comment node containing $text.
+        */
+       public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) {
+               $node = $this->doc->createComment( $text );
+               $this->insertNode( $preposition, $refElement, $node );
+       }
+
+       /**
+        * Forward an error to the callback given to the constructor, if any.
+        */
+       public function error( $text, $pos ) {
+               if ( $this->errorCallback ) {
+                       call_user_func( $this->errorCallback, $text, $pos );
+               }
+       }
+
+       /**
+        * Add the given attributes to an existing element's node, leaving any
+        * attribute which is already present unchanged.
+        */
+       public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
+               $node = $element->userData;
+               foreach ( $attrs->getObjects() as $attr ) {
+                       if ( $attr->namespaceURI === null
+                               && strpos( $attr->localName, ':' ) !== false
+                       ) {
+                               // As noted in createNode(), we can't use hasAttribute() here.
+                               // However, we can use the return value of setAttributeNodeNS()
+                               // instead.
+                               $attrNode = $this->doc->createAttribute( $attr->localName );
+                               $attrNode->value = $attr->value;
+                               $replaced = $node->setAttributeNodeNS( $attrNode );
+                               if ( $replaced ) {
+                                       // Put it back how it was
+                                       $node->setAttributeNodeNS( $replaced );
+                               }
+                       } elseif ( $attr->namespaceURI === null ) {
+                               if ( !$node->hasAttribute( $attr->localName ) ) {
+                                       $node->setAttribute( $attr->localName, $attr->value );
+                               }
+                       } elseif ( !$node->hasAttributeNS( $attr->namespaceURI, $attr->localName ) ) {
+                               $node->setAttributeNS( $attr->namespaceURI, $attr->localName, $attr->value );
+                       }
+               }
+       }
+
+       /**
+        * Detach the element's node from its parent.
+        */
+       public function removeNode( Element $element, $sourceStart ) {
+               $node = $element->userData;
+               $node->parentNode->removeChild( $node );
+       }
+
+       /**
+        * Insert $newParent as the last child of $element, then move every child
+        * which precedes it into $newParent, preserving order.
+        */
+       public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
+               $this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 );
+               $node = $element->userData;
+               $newParentNode = $newParent->userData;
+               // Stop once the new parent itself has become the first child
+               while ( $node->firstChild !== $newParentNode ) {
+                       $newParentNode->appendChild( $node->firstChild );
+               }
+       }
+}
diff --git a/src/DOM/DOMFormatter.php b/src/DOM/DOMFormatter.php
new file mode 100644
index 0000000..dad8a7d
--- /dev/null
+++ b/src/DOM/DOMFormatter.php
@@ -0,0 +1,21 @@
+<?php
+
+namespace RemexHtml\DOM;
+
+interface DOMFormatter {
+       /**
+        * Format a DOMNode together with all of its descendants.
+        *
+        * @param \DOMNode $node The node to format
+        * @return string
+        */
+       public function formatDOMNode( \DOMNode $node );
+
+       /**
+        * Format a single DOMElement without recursing into its children.
+        *
+        * @param \DOMElement $element The element to format
+        * @param string $contents The already formatted contents of the element
+        * @return string
+        */
+       public function formatDOMElement( \DOMElement $element, $contents );
+}
diff --git a/src/DOM/DOMSerializer.php b/src/DOM/DOMSerializer.php
new file mode 100644
index 0000000..8f60265
--- /dev/null
+++ b/src/DOM/DOMSerializer.php
@@ -0,0 +1,86 @@
+<?php
+
+namespace RemexHtml\DOM;
+use RemexHtml\Serializer\AbstractSerializer;
+use RemexHtml\Tokenizer\Attributes;
+use RemexHtml\TreeBuilder\Element;
+
+/**
+ * This class provides a Serializer-like interface to DOMBuilder, allowing
+ * DOMBuilder and direct serialization to be used interchangeably.
+ *
+ * HtmlFormatter::formatDOMNode() can be used directly if this interface is
+ * not required.
+ */
+class DOMSerializer implements AbstractSerializer {
+       /** @var DOMBuilder The builder which receives every tree event */
+       private $builder;
+
+       /** @var DOMFormatter The formatter used by getResult() */
+       private $formatter;
+
+       /**
+        * Constructor
+        *
+        * @param DOMBuilder $builder The builder which constructs the DOM
+        * @param DOMFormatter $formatter This may be, for example, an HtmlFormatter object
+        */
+       public function __construct( DOMBuilder $builder, DOMFormatter $formatter ) {
+               $this->builder = $builder;
+               $this->formatter = $formatter;
+       }
+
+       /**
+        * Format each child of the builder's document (or fragment wrapper
+        * element) and return the concatenated output.
+        *
+        * @return string
+        */
+       public function getResult() {
+               $fragment = $this->builder->getFragment();
+               $s = '';
+               foreach ( $fragment->childNodes as $child ) {
+                       $s .= $this->formatter->formatDOMNode( $child );
+               }
+               return $s;
+       }
+
+       // The remaining TreeHandler methods pass their arguments straight
+       // through to the DOMBuilder.
+
+       public function startDocument( $fragmentNamespace, $fragmentName ) {
+               $this->builder->startDocument( $fragmentNamespace, $fragmentName );
+       }
+
+       public function endDocument( $pos ) {
+               $this->builder->endDocument( $pos );
+       }
+
+       public function characters( $preposition, $refElement, $text, $start, $length,
+               $sourceStart, $sourceLength
+       ) {
+               $this->builder->characters( $preposition, $refElement, $text, $start, $length,
+                       $sourceStart, $sourceLength );
+       }
+
+       public function insertElement( $preposition, $refElement, Element $element, $void,
+               $sourceStart, $sourceLength
+       ) {
+               $this->builder->insertElement( $preposition, $refElement, $element, $void,
+                       $sourceStart, $sourceLength );
+       }
+
+       public function endTag( Element $element, $sourceStart, $sourceLength ) {
+               $this->builder->endTag( $element, $sourceStart, $sourceLength );
+       }
+
+       public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
+               $this->builder->doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength );
+       }
+
+       public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) {
+               $this->builder->comment( $preposition, $refElement, $text, $sourceStart, $sourceLength );
+       }
+
+       public function error( $text, $pos ) {
+               $this->builder->error( $text, $pos );
+       }
+
+       public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
+               $this->builder->mergeAttributes( $element, $attrs, $sourceStart );
+       }
+
+       public function removeNode( Element $element, $sourceStart ) {
+               $this->builder->removeNode( $element, $sourceStart );
+       }
+
+       public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
+               $this->builder->reparentChildren( $element, $newParent, $sourceStart );
+       }
+}
diff --git a/src/Serializer/AbstractSerializer.php 
b/src/Serializer/AbstractSerializer.php
new file mode 100644
index 0000000..0243a7d
--- /dev/null
+++ b/src/Serializer/AbstractSerializer.php
@@ -0,0 +1,12 @@
+<?php
+namespace RemexHtml\Serializer;
+use RemexHtml\TreeBuilder\TreeHandler;
+
+interface AbstractSerializer extends TreeHandler {
+       /**
+        * Return the serialized output produced by tree construction.
+        *
+        * @return string
+        */
+       public function getResult();
+}
diff --git a/src/Serializer/HtmlFormatter.php b/src/Serializer/HtmlFormatter.php
index 40e3a89..3dbf594 100644
--- a/src/Serializer/HtmlFormatter.php
+++ b/src/Serializer/HtmlFormatter.php
@@ -2,11 +2,12 @@
 
 namespace RemexHtml\Serializer;
 use RemexHtml\HTMLData;
+use RemexHtml\DOM\DOMFormatter;
 
 /**
  * A formatter which follows the HTML 5 fragment serialization algorithm.
  */
-class HtmlFormatter implements Formatter {
+class HtmlFormatter implements Formatter, DOMFormatter {
        /**
         * The elements for which a closing tag is omitted.
         */
@@ -73,28 +74,37 @@
        ];
 
        /**
-        * The scripting flag, which is true if scripting is enabled. This 
influences
-        * <noscript> serialization.
+        * Attribute namespaces which have unqualified local names
         */
-       protected $scriptingFlag;
+       protected $unqualifiedNamespaces = [
+               HTMLData::NS_HTML => true,
+               HTMLData::NS_MATHML => true,
+               HTMLData::NS_SVG => true,
+       ];
+
+       protected $useSourceDoctype;
 
        /**
         * Constructor.
         *
         * @param array $options An associative array of options:
         *   - scriptingFlag : Set this to false to disable scripting. True by 
default.
+        *   - useSourceDoctype : Emit the doctype used in the source. If this 
is
+        *     false or absent, an HTML doctype will be used.
         */
        public function __construct( $options = [] ) {
                $options += [
-                       'scriptingFlag' => true
+                       'scriptingFlag' => true,
+                       'useSourceDoctype' => false,
                ];
                if ( $options['scriptingFlag'] ) {
                        $this->rawTextElements['noscript'] = true;
                }
+               $this->useSourceDoctype = $options['useSourceDoctype'];
        }
 
        public function startDocument( $fragmentNamespace, $fragmentName ) {
-               return "<!DOCTYPE html>\n";
+               return "<!DOCTYPE html>";
        }
 
        public function characters( SerializerNode $parent, $text, $start, 
$length ) {
@@ -136,4 +146,114 @@
        public function doctype( $name, $public, $system ) {
                return '';
        }
+
+       /**
+        * Recursively serialize a DOM node and its descendants.
+        *
+        * The children are formatted first; the result is then wrapped or
+        * discarded depending on the type of $node. Node types which have no
+        * case below produce an empty string.
+        *
+        * @param \DOMNode $node The node to format
+        * @return string
+        */
+       public function formatDOMNode( \DOMNode $node ) {
+               $contents = '';
+               if ( $node->firstChild ) {
+                       foreach ( $node->childNodes as $child ) {
+                               $contents .= $this->formatDOMNode( $child );
+                       }
+               }
+
+               switch ( $node->nodeType ) {
+               case XML_ELEMENT_NODE:
+                       return $this->formatDOMElement( $node, $contents );
+
+               case XML_DOCUMENT_NODE:
+                       // With useSourceDoctype, the doctype comes from the
+                       // XML_DOCUMENT_TYPE_NODE child instead
+                       if ( !$this->useSourceDoctype ) {
+                               return "<!DOCTYPE html>" . $contents;
+                       } else {
+                               return $contents;
+                       }
+
+               case XML_DOCUMENT_FRAG_NODE:
+                       return $contents;
+
+               case XML_TEXT_NODE:
+                       $text = $node->data;
+                       $parent = $node->parentNode;
+                       // Text inside HTML raw text elements is emitted unescaped
+                       if ( $parent->namespaceURI !== HTMLData::NS_HTML
+                               || !isset( $this->rawTextElements[$parent->nodeName] )
+                       ) {
+                               $text = strtr( $text, $this->textEscapes );
+                       }
+                       return $text;
+
+               case XML_CDATA_SECTION_NODE:
+                       $parent = $node->parentNode;
+                       if ( $parent->namespaceURI === HTMLData::NS_HTML ) {
+                               // CDATA is not allowed in HTML nodes
+                               return $node->data;
+                       } else {
+                               return "<![CDATA[{$node->data}]]>";
+                       }
+
+               case XML_PI_NODE:
+                       return "<?{$node->target} {$node->data}>";
+
+               case XML_COMMENT_NODE:
+                       // A comment is delimited by "<!--" and "-->"; a bare "<...>"
+                       // would be parsed back as a tag
+                       return "<!--{$node->data}-->";
+
+               case XML_DOCUMENT_TYPE_NODE:
+                       if ( $this->useSourceDoctype ) {
+                               return "<!DOCTYPE {$node->name}>";
+                       } else {
+                               return '';
+                       }
+
+               default:
+                       return '';
+               }
+       }
+
+       /**
+        * Serialize a single element: its start tag with attributes, the given
+        * contents, and its end tag where one is emitted.
+        *
+        * @param \DOMElement $node The element to format
+        * @param string $contents The already formatted contents of the element
+        * @return string
+        */
+       public function formatDOMElement( \DOMElement $node, $contents ) {
+               $ns = $node->namespaceURI;
+               $prefix = $node->prefix;
+               // PHP's DOM reports a missing prefix as an empty string, so both
+               // null and '' must be treated as "no prefix". Otherwise the tag
+               // name would come out as ":localName".
+               if ( $ns === null
+                       || isset( $this->unqualifiedNamespaces[$ns] )
+                       || $prefix === null
+                       || $prefix === ''
+               ) {
+                       $name = $node->localName;
+               } else {
+                       $name = $prefix . ':' . $node->localName;
+               }
+               $s = '<' . $name;
+               foreach ( $node->attributes as $attr ) {
+                       // Attributes in the XML, XMLNS and XLink namespaces get fixed
+                       // prefixes, whatever prefix the DOM holds for them
+                       switch ( $attr->namespaceURI ) {
+                       case HTMLData::NS_XML:
+                               $attrName = 'xml:' . $attr->localName;
+                               break;
+                       case HTMLData::NS_XMLNS:
+                               if ( $attr->localName === 'xmlns' ) {
+                                       $attrName = 'xmlns';
+                               } else {
+                                       $attrName = 'xmlns:' . $attr->localName;
+                               }
+                               break;
+                       case HTMLData::NS_XLINK:
+                               $attrName = 'xlink:' . $attr->localName;
+                               break;
+                       default:
+                               if ( strlen( $attr->prefix ) ) {
+                                       $attrName = $attr->prefix . ':' . $attr->localName;
+                               } else {
+                                       $attrName = $attr->localName;
+                               }
+                       }
+                       $encValue = strtr( $attr->value, $this->attributeEscapes );
+                       $s .= " $attrName=\"$encValue\"";
+               }
+               $s .= '>';
+               if ( $ns === HTMLData::NS_HTML ) {
+                       if ( isset( $contents[0] ) && $contents[0] === "\n"
+                               && isset( $this->prefixLfElements[$name] )
+                       ) {
+                               // Emit an extra newline when the contents of one of these
+                               // elements begin with a newline
+                               $s .= "\n$contents</$name>";
+                       } elseif ( !isset( $this->voidElements[$name] ) ) {
+                               $s .= "$contents</$name>";
+                       }
+                       // HTML void elements get neither contents nor an end tag
+               } else {
+                       $s .= "$contents</$name>";
+               }
+               return $s;
+       }
 }
diff --git a/src/Serializer/Serializer.php b/src/Serializer/Serializer.php
index d5150f3..a31adeb 100644
--- a/src/Serializer/Serializer.php
+++ b/src/Serializer/Serializer.php
@@ -12,7 +12,7 @@
  * encoding elements when the end tags are seen. This is faster than building
  * a DOM and then serializing it, even if you use DOMDocument::saveHTML().
  */
-class Serializer implements TreeHandler {
+class Serializer implements AbstractSerializer {
        /**
         * A node corresponding to the Document
         * @var SerializerNode
diff --git a/src/Serializer/TestFormatter.php b/src/Serializer/TestFormatter.php
index fe33293..65af2b3 100644
--- a/src/Serializer/TestFormatter.php
+++ b/src/Serializer/TestFormatter.php
@@ -1,20 +1,29 @@
 <?php
 
 namespace RemexHtml\Serializer;
+use RemexHtml\Tokenizer\Attribute;
 use RemexHtml\Tokenizer\Attributes;
+use RemexHtml\Tokenizer\PlainAttributes;
 use RemexHtml\HTMLData;
+use RemexHtml\DOM\DOMFormatter;
 
 /**
  * A Formatter which is used to format documents in (almost) the way they
  * appear in the html5lib tests. A little bit of post-processing is required
- * in the PHPUnit test.
+ * in the PHPUnit tests.
  */
-class TestFormatter implements Formatter {
-       function startDocument( $fragmentNamespace, $fragmentName ) {
+class TestFormatter implements Formatter, DOMFormatter {
+       private static $attrNamespaces = [
+               HTMLData::NS_XML => 'xml',
+               HTMLData::NS_XLINK => 'xlink',
+               HTMLData::NS_XMLNS => 'xmlns',
+       ];
+
+       public function startDocument( $fragmentNamespace, $fragmentName ) {
                return '';
        }
 
-       function doctype( $name, $public, $system ) {
+       public function doctype( $name, $public, $system ) {
                $ret = "<!DOCTYPE $name";
                if ( $public !== '' || $system !== '' ) {
                        $ret .= " \"$public\" \"$system\"";
@@ -23,15 +32,22 @@
                return $ret;
        }
 
-       function characters( SerializerNode $parent, $text, $start, $length ) {
+       public function characters( SerializerNode $parent, $text, $start, 
$length ) {
+               return $this->formatCharacters( substr( $text, $start, $length 
) );
+       }
+
+       private function formatCharacters( $text ) {
                return '"' .
-                       str_replace( "\n", "<EOL>", substr( $text, $start, 
$length ) ) .
+                       str_replace( "\n", "<EOL>", $text ) .
                        "\"\n";
        }
 
-       function element( SerializerNode $parent, SerializerNode $node, 
$contents ) {
-               $namespace = $node->namespace;
-               $name = $node->name;
+       public function element( SerializerNode $parent, SerializerNode $node, 
$contents ) {
+               return $this->formatElement( $node->namespace, $node->name,
+                       $node->attrs->getObjects(), $contents );
+       }
+
+       private function formatElement( $namespace, $name, $attrs, $contents ) {
                if ( $namespace === HTMLData::NS_HTML ) {
                        $tagName = $name;
                } elseif ( $namespace === HTMLData::NS_SVG ) {
@@ -42,14 +58,17 @@
                        $tagName = $name;
                }
                $ret = "<$tagName>\n";
-               $sortedAttrs = $node->attrs->getObjects();
+               $sortedAttrs = $attrs;
                ksort( $sortedAttrs, SORT_STRING );
                foreach ( $sortedAttrs as $attrName => $attr ) {
-                       if ( $attr->prefix !== null ) {
-                               $ret .= "  {$attr->prefix} {$attr->localName}=\"{$attr->value}\"\n";
-                       } else {
-                               $ret .= "  $attrName=\"{$attr->value}\"\n";
+                       if ( $attr->namespaceURI === null
+                               || isset( $attr->reallyNoNamespace )
+                       ) {
+                               $prefix = '';
+                       } elseif ( isset( self::$attrNamespaces[$attr->namespaceURI] ) ) {
+                               $prefix = self::$attrNamespaces[$attr->namespaceURI] . ' ';
                        }
+                       $ret .= "  $prefix{$attr->localName}=\"{$attr->value}\"\n";
                }
                if ( $contents !== null && $contents !== '' ) {
                        $contents = preg_replace( '/^/m', '  ', $contents );
@@ -67,7 +86,80 @@
                return $ret;
        }
 
-       function comment( SerializerNode $parent, $text ) {
+       public function comment( SerializerNode $parent, $text ) {
+               return $this->formatComment( $text );
+       }
+
+       private function formatComment( $text ) {
                return "<!-- $text -->\n";
        }
+
+       public function formatDOMNode( \DOMNode $node ) {
+               $contents = '';
+               if ( $node->firstChild ) {
+                       foreach ( $node->childNodes as $child ) {
+                               $contents .= $this->formatDOMNode( $child );
+                       }
+               }
+
+               switch ( $node->nodeType ) {
+               case XML_ELEMENT_NODE:
+                       return $this->formatDOMElement( $node, $contents );
+
+               case XML_DOCUMENT_NODE:
+               case XML_DOCUMENT_FRAG_NODE:
+                       return $contents;
+
+               case XML_TEXT_NODE:
+               case XML_CDATA_SECTION_NODE:
+                       return $this->formatCharacters( $node->data );
+
+               case XML_COMMENT_NODE:
+                       return $this->formatComment( $node->data );
+
+               case XML_DOCUMENT_TYPE_NODE:
+                       return $this->doctype( $node->name, $node->publicId, $node->systemId );
+
+               case XML_PI_NODE:
+               default:
+                       return '';
+               }
+       }
+
+       public function formatDOMElement( \DOMElement $node, $content ) {
+               $attrs = [];
+               foreach ( $node->attributes as $attr ) {
+                       $prefix = null;
+                       switch ( $attr->namespaceURI ) {
+                       case HTMLData::NS_XML:
+                               $prefix = 'xml';
+                               $qName = 'xml:' . $attr->localName;
+                               break;
+                       case HTMLData::NS_XMLNS:
+                               if ( $attr->localName === 'xmlns' ) {
+                                       $qName = 'xmlns';
+                               } else {
+                                       $prefix = 'xmlns';
+                                       $qName = 'xmlns:' . $attr->localName;
+                               }
+                               break;
+                       case HTMLData::NS_XLINK:
+                               $prefix = 'xlink';
+                               $qName = 'xlink:' . $attr->localName;
+                               break;
+                       default:
+                               if ( strlen( $attr->prefix ) ) {
+                                       $qName = $attr->prefix . ':' . $attr->localName;
+                               } else {
+                                       $prefix = $attr->prefix;
+                                       $qName = $attr->localName;
+                               }
+                       }
+
+                       $attrs[$qName] = new Attribute( $qName, $attr->namespaceURI, $prefix,
+                               $attr->localName, $attr->value );
+               }
+
+               return $this->formatElement( $node->namespaceURI, $node->nodeName, $attrs, $content );
+       }
 }
diff --git a/src/TreeBuilder/DOMBuilder.php b/src/TreeBuilder/DOMBuilder.php
deleted file mode 100644
index ccf36f5..0000000
--- a/src/TreeBuilder/DOMBuilder.php
+++ /dev/null
@@ -1,127 +0,0 @@
-<?php
-
-namespace RemexHtml\TreeBuilder;
-use RemexHtml\Tokenizer\Attributes;
-
-/**
- * A TreeHandler which constructs a DOMDocument
- */
-class DOMBuilder implements TreeHandler {
-       private $doc;
-       private $errorCallback;
-
-       /**
-        * @param callable|null $errorCallback A function which is called on parse errors
-        */
-       public function __construct( $errorCallback = null ) {
-               $this->errorCallback = $errorCallback;
-       }
-
-       /**
-        * Get the constructed document
-        * @return DOMDocument
-        */
-       public function getDocument() {
-               return $this->doc;
-       }
-
-       public function startDocument( $fns, $fn ) {
-               $this->doc = new \DOMDocument;
-       }
-
-       public function endDocument( $pos ) {
-       }
-
-       private function insertNode( $preposition, $refElement, $node ) {
-               if ( $preposition === TreeBuilder::ROOT ) {
-                       $parent = $this->doc;
-                       $refNode = null;
-               } elseif ( $preposition === TreeBuilder::BEFORE ) {
-                       $parent = $refElement->userData->parentNode;
-                       $refNode = $refElement->userData;
-               } else {
-                       $parent = $refElement->userData;
-                       $refNode = null;
-               }
-               $parent->insertBefore( $node, $refNode );
-       }
-
-       private function createNode( Element $element ) {
-               $node = $this->doc->createElementNS(
-                       $element->namespace,
-                       $element->name );
-
-               foreach ( $element->getAttributeObjects() as $attr ) {
-                       if ( $attr->namespaceURI !== null ) {
-                               $node->setAttributeNS(
-                                       $attr->namespaceURI,
-                                       $attr->qualifiedName,
-                                       $attr->value );
-                       } else {
-                               $node->setAttribute( $attr->localName, $attr->value );
-                       }
-               }
-               $element->userData = $node;
-               return $node;
-       }
-
-       public function characters( $preposition, $refElement, $text, $start, $length,
-               $sourceStart, $sourceLength
-       ) {
-               $node = $this->doc->createTextNode( substr( $text, $start, $length ) );
-               $this->insertNode( $preposition, $refElement, $node );
-       }
-
-       public function insertElement( $preposition, $refElement, Element $element, $void,
-               $sourceStart, $sourceLength
-       ) {
-               if ( $element->userData ) {
-                       $node = $element->userData;
-               } else {
-                       $node = $this->createNode( $element );
-               }
-               $this->insertNode( $preposition, $refElement, $node );
-       }
-
-       public function endTag( Element $element, $sourceStart, $sourceLength ) {
-       }
-
-       public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
-       }
-
-       public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) {
-               $node = $this->doc->createComment( $text );
-               $this->insertNode( $preposition, $refElement, $node );
-       }
-
-       public function error( $text, $pos ) {
-               if ( $this->errorCallback ) {
-                       call_user_func( $this->errorCallback, $text, $pos );
-               }
-       }
-
-       public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
-               $node = $element->userData;
-               foreach ( $attrs->getValues() as $name => $value ) {
-                       if ( !$node->hasAttribute( $name ) ) {
-                               $node->setAttribute( $name, $value );
-                       }
-               }
-       }
-
-       public function removeNode( Element $element, $sourceStart ) {
-               $node = $element->userData;
-               $node->parent->removeChild( $node );
-       }
-
-       public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
-               $this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 );
-               $node = $element->userData;
-               $newParentNode = $newParent->userData;
-               foreach ( $node->childNodes as $child ) {
-                       if ( $child !== $newParentNode ) {
-                               $newParentNode->appendChild( $child );
-                       }
-               }
-       }
-}
diff --git a/tests/phpunit/TreeBuilderTest.php b/tests/phpunit/TreeBuilderTest.php
index 24052b0..bb4cc1a 100644
--- a/tests/phpunit/TreeBuilderTest.php
+++ b/tests/phpunit/TreeBuilderTest.php
@@ -1,6 +1,7 @@
 <?php
 
 namespace RemexHtml\TreeBuilder;
+use RemexHtml\DOM;
 use RemexHtml\HTMLData;
 use RemexHtml\Tokenizer;
 use RemexHtml\Serializer;
@@ -19,7 +20,36 @@
        private static $testBlacklist = [
        ];
 
-       public function provider() {
+       private static $domTestBlacklist = [
+               // Invalid tag name
+               'tree-construction/html5test-com.dat:1',
+               'tree-construction/webkit01.dat:179',
+
+               // Invalid attribute name
+               'tree-construction/html5test-com.dat:12',
+               'tree-construction/html5test-com.dat:39',
+               'tree-construction/tests14.dat:45',
+               'tree-construction/tests14.dat:55',
+               'tree-construction/tests14.dat:67',
+               'tree-construction/tests26.dat:263',
+               'tree-construction/webkit01.dat:606',
+
+               // Invalid doctype
+               'tree-construction/doctype01.dat:32',
+               'tree-construction/doctype01.dat:45',
+               'tree-construction/tests6.dat:48',
+       ];
+
+       public function serializerProvider() {
+               return $this->provider( 'serializer' );
+       }
+
+       public function domProvider() {
+               return $this->provider( 'dom' );
+       }
+
+
+       private function provider( $type ) {
                $testFiles = [];
                foreach ( self::$testDirs as $testDir ) {
                        $testFiles = array_merge( $testFiles, glob( __DIR__ . "/../$testDir/*.dat" ) );
@@ -29,7 +59,7 @@
                        if ( in_array( 'tree-construction/' . basename( $fileName ), self::$fileBlacklist ) ) {
                                continue;
                        }
-                       $tests = $this->readFile( $fileName );
+                       $tests = $this->readFile( $fileName, $type );
 
                        foreach ( $tests as $test ) {
                                if ( isset( $test['scripting'] ) ) {
@@ -45,7 +75,7 @@
                return $args;
        }
 
-       private function readFile( $fileName ) {
+       private function readFile( $fileName, $type ) {
                $text = file_get_contents( $fileName );
                if ( $text === false ) {
                        throw new \Exception( "Cannot read test file: $fileName" );
@@ -97,10 +127,17 @@
                                        break;
                                }
                        } while ( !$section['end'] );
-                       
-                       if ( !in_array( "$baseName:$startLine", self::$testBlacklist ) ) {
-                               $tests[] = $test;
+
+                       if ( in_array( "$baseName:$startLine", self::$testBlacklist ) ) {
+                               continue;
                        }
+                       if ( $type === 'dom'
+                               && in_array( "$baseName:$startLine", self::$domTestBlacklist )
+                       ) {
+                               continue;
+                       }
+                       
+                       $tests[] = $test;
                }
                return $tests;
        }
@@ -159,13 +196,25 @@
                return $result;
        }
 
-       /** @dataProvider provider */
-       public function testDefault( $params ) {
+       /** @dataProvider serializerProvider */
+       public function testSerializer( $params ) {
+               $formatter = new Serializer\TestFormatter;
+               $serializer = new Serializer\Serializer( $formatter );
+               $this->runWithSerializer( $serializer, $params );
+       }
+
+       /** @dataProvider domProvider */
+       public function testDOMSerializer( $params ) {
+               $formatter = new Serializer\TestFormatter;
+               $builder = new DOM\DOMBuilder;
+               $serializer = new DOM\DOMSerializer( $builder, $formatter );
+               $this->runWithSerializer( $serializer, $params );
+       }
+
+       private function runWithSerializer( Serializer\AbstractSerializer $serializer, $params ) {
                if ( !isset( $params['document'] ) ) {
                        throw new \Exception( "Test lacks #document: {$params['file']}:{$params['line']}" );
                }
-               $formatter = new Serializer\TestFormatter;
-               $serializer = new Serializer\Serializer( $formatter );
                $treeBuilder = new TreeBuilder( $serializer, [
                        'scriptingFlag' => $params['scripting']
                ] );

-- 
To view, visit https://gerrit.wikimedia.org/r/335367
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I4fc50a155f1f94c9f6bfdd888d0d2aebfc43637d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/libs/RemexHtml
Gerrit-Branch: master
Gerrit-Owner: Tim Starling <tstarl...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to