[MediaWiki-commits] [Gerrit] mediawiki...RemexHtml[master]: Add XML infoset coercion to DOMBuilder
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/335596 ) Change subject: Add XML infoset coercion to DOMBuilder .. Add XML infoset coercion to DOMBuilder * In DOMBuilder, encode names that are invalid in XML * For performance, trigger encoding names only when an exception is caught. * Invert the encoding in TestFormatter, and optionally in HtmlFormatter * Re-enable tests that now pass Change-Id: Iddfa497e07636693baa280575b086f46adefe002 --- M src/DOM/DOMBuilder.php A src/DOM/DOMUtils.php M src/GenerateDataFiles.php M src/HTMLData.php M src/Serializer/HtmlFormatter.php M src/Serializer/TestFormatter.php M tests/phpunit/TreeBuilderTest.php 7 files changed, 277 insertions(+), 34 deletions(-) Approvals: Subramanya Sastry: Looks good to me, approved jenkins-bot: Verified diff --git a/src/DOM/DOMBuilder.php b/src/DOM/DOMBuilder.php index 5519e17..e4de294 100644 --- a/src/DOM/DOMBuilder.php +++ b/src/DOM/DOMBuilder.php @@ -10,14 +10,15 @@ * A TreeHandler which constructs a DOMDocument */ class DOMBuilder implements TreeHandler { - private $doc; - private $errorCallback; - private $isFragment; - public $doctypeName; public $public; public $system; public $quirks; + + private $doc; + private $errorCallback; + private $isFragment; + private $coerced; /** * @param callable|null $errorCallback A function which is called on parse errors @@ -45,6 +46,16 @@ } } + /** +* Returns true if the document was coerced due to libxml limitations. We +* follow HTML 5.1 § 8.2.7 "Coercing an HTML DOM into an infoset". +* +* @return bool +*/ + public function isCoerced() { + return $this->coerced; + } + public function startDocument( $fragmentNamespace, $fragmentName ) { $impl = new \DOMImplementation; $this->isFragment = $fragmentNamespace !== null; @@ -53,9 +64,10 @@ private function createDocument( $doctypeName = null, $public = null, $system = null ) { $impl = new \DOMImplementation; - if ( $doctypeName === null - || $doctypeName === '' // libxml limitation, causes test failures - ) { + if ( $doctypeName === '' ) { + $this->coerced = true; + $doc = $impl->createDocument( null, null ); + } elseif ( $doctypeName === null ) { $doc = $impl->createDocument( null, null ); } else { $doctype = $impl->createDocumentType( $doctypeName, $public, $system ); @@ -82,10 +94,31 @@ $parent->insertBefore( $node, $refNode ); } + /** +* Replace unsupported characters with a code of the form U123456. +* +* @param string $name +* @return string +*/ + private function coerceName( $name ) { + $coercedName = DOMUtils::coerceName( $name ); + if ( $name !== $coercedName ) { + $this->coerced = true; + } + return $coercedName; + } + private function createNode( Element $element ) { - $node = $this->doc->createElementNS( - $element->namespace, - $element->name ); + try { + $node = $this->doc->createElementNS( + $element->namespace, + $element->name ); + } catch ( \DOMException $e ) { + // Attempt to escape the name so that it is more acceptable + $node = $this->doc->createElementNS( + $element->namespace, + $this->coerceName( $element->name ) ); + } foreach ( $element->attrs->getObjects() as $attr ) { if ( $attr->namespaceURI === null @@ -98,12 +131,26 @@ // way can't be discovered via hasAttribute() or hasAttributeNS(). $attrNode = $this->doc->createAttribute( $attr->localName ); $attrNode->value = $attr->value; - $node->setAttributeNodeNS( $attrNode ); + try { + $node->setAttributeNodeNS( $attrNode ); + } catch ( \DOMException $e ) { + $node->setAttributeNS( + $attr->namespaceURI, + $this->coerceName( $attr->qualifiedName ), +
[MediaWiki-commits] [Gerrit] mediawiki...RemexHtml[master]: Add XML infoset coercion to DOMBuilder
Tim Starling has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/335596 ) Change subject: Add XML infoset coercion to DOMBuilder .. Add XML infoset coercion to DOMBuilder * In DOMBuilder, encode names that are invalid in XML * For performance, trigger encoding names only when an exception is caught. * Invert the encoding in TestFormatter, and optionally in HtmlFormatter * Re-enable tests that now pass Change-Id: Iddfa497e07636693baa280575b086f46adefe002 --- M src/DOM/DOMBuilder.php M src/GenerateDataFiles.php M src/HTMLData.php M src/Serializer/HtmlFormatter.php M src/Serializer/TestFormatter.php M tests/phpunit/TreeBuilderTest.php 6 files changed, 238 insertions(+), 34 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/libs/RemexHtml refs/changes/96/335596/1 diff --git a/src/DOM/DOMBuilder.php b/src/DOM/DOMBuilder.php index 5519e17..e4de294 100644 --- a/src/DOM/DOMBuilder.php +++ b/src/DOM/DOMBuilder.php @@ -10,14 +10,15 @@ * A TreeHandler which constructs a DOMDocument */ class DOMBuilder implements TreeHandler { - private $doc; - private $errorCallback; - private $isFragment; - public $doctypeName; public $public; public $system; public $quirks; + + private $doc; + private $errorCallback; + private $isFragment; + private $coerced; /** * @param callable|null $errorCallback A function which is called on parse errors @@ -45,6 +46,16 @@ } } + /** +* Returns true if the document was coerced due to libxml limitations. We +* follow HTML 5.1 § 8.2.7 "Coercing an HTML DOM into an infoset". +* +* @return bool +*/ + public function isCoerced() { + return $this->coerced; + } + public function startDocument( $fragmentNamespace, $fragmentName ) { $impl = new \DOMImplementation; $this->isFragment = $fragmentNamespace !== null; @@ -53,9 +64,10 @@ private function createDocument( $doctypeName = null, $public = null, $system = null ) { $impl = new \DOMImplementation; - if ( $doctypeName === null - || $doctypeName === '' // libxml limitation, causes test failures - ) { + if ( $doctypeName === '' ) { + $this->coerced = true; + $doc = $impl->createDocument( null, null ); + } elseif ( $doctypeName === null ) { $doc = $impl->createDocument( null, null ); } else { $doctype = $impl->createDocumentType( $doctypeName, $public, $system ); @@ -82,10 +94,31 @@ $parent->insertBefore( $node, $refNode ); } + /** +* Replace unsupported characters with a code of the form U123456. +* +* @param string $name +* @return string +*/ + private function coerceName( $name ) { + $coercedName = DOMUtils::coerceName( $name ); + if ( $name !== $coercedName ) { + $this->coerced = true; + } + return $coercedName; + } + private function createNode( Element $element ) { - $node = $this->doc->createElementNS( - $element->namespace, - $element->name ); + try { + $node = $this->doc->createElementNS( + $element->namespace, + $element->name ); + } catch ( \DOMException $e ) { + // Attempt to escape the name so that it is more acceptable + $node = $this->doc->createElementNS( + $element->namespace, + $this->coerceName( $element->name ) ); + } foreach ( $element->attrs->getObjects() as $attr ) { if ( $attr->namespaceURI === null @@ -98,12 +131,26 @@ // way can't be discovered via hasAttribute() or hasAttributeNS(). $attrNode = $this->doc->createAttribute( $attr->localName ); $attrNode->value = $attr->value; - $node->setAttributeNodeNS( $attrNode ); + try { + $node->setAttributeNodeNS( $attrNode ); + } catch ( \DOMException $e ) { + $node->setAttributeNS( + $attr->namespaceURI, + $this->coerceName( $attr->qualifiedName ), +