[MediaWiki-commits] [Gerrit] mediawiki...RemexHtml[master]: Add XML infoset coercion to DOMBuilder

2017-02-20 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/335596 )

Change subject: Add XML infoset coercion to DOMBuilder
..


Add XML infoset coercion to DOMBuilder

* In DOMBuilder, encode names that are invalid in XML
* For performance, trigger encoding names only when an exception is caught.
* Invert the encoding in TestFormatter, and optionally in HtmlFormatter
* Re-enable tests that now pass

Change-Id: Iddfa497e07636693baa280575b086f46adefe002
---
M src/DOM/DOMBuilder.php
A src/DOM/DOMUtils.php
M src/GenerateDataFiles.php
M src/HTMLData.php
M src/Serializer/HtmlFormatter.php
M src/Serializer/TestFormatter.php
M tests/phpunit/TreeBuilderTest.php
7 files changed, 277 insertions(+), 34 deletions(-)

Approvals:
  Subramanya Sastry: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/src/DOM/DOMBuilder.php b/src/DOM/DOMBuilder.php
index 5519e17..e4de294 100644
--- a/src/DOM/DOMBuilder.php
+++ b/src/DOM/DOMBuilder.php
@@ -10,14 +10,15 @@
  * A TreeHandler which constructs a DOMDocument
  */
 class DOMBuilder implements TreeHandler {
-   private $doc;
-   private $errorCallback;
-   private $isFragment;
-
public $doctypeName;
public $public;
public $system;
public $quirks;
+
+   private $doc;
+   private $errorCallback;
+   private $isFragment;
+   private $coerced;
 
/**
 * @param callable|null $errorCallback A function which is called on 
parse errors
@@ -45,6 +46,16 @@
}
}
 
+   /**
+* Returns true if the document was coerced due to libxml limitations. 
We
+* follow HTML 5.1 § 8.2.7 "Coercing an HTML DOM into an infoset".
+*
+* @return bool
+*/
+   public function isCoerced() {
+   return $this->coerced;
+   }
+
public function startDocument( $fragmentNamespace, $fragmentName ) {
$impl = new \DOMImplementation;
$this->isFragment = $fragmentNamespace !== null;
@@ -53,9 +64,10 @@
 
private function createDocument( $doctypeName = null, $public = null, 
$system = null ) {
$impl = new \DOMImplementation;
-   if ( $doctypeName === null
-   || $doctypeName === '' // libxml limitation, causes 
test failures
-   ) {
+   if ( $doctypeName === '' ) {
+   $this->coerced = true;
+   $doc = $impl->createDocument( null, null );
+   } elseif ( $doctypeName === null ) {
$doc = $impl->createDocument( null, null );
} else {
$doctype = $impl->createDocumentType( $doctypeName, 
$public, $system );
@@ -82,10 +94,31 @@
$parent->insertBefore( $node, $refNode );
}
 
+   /**
+* Replace unsupported characters with a code of the form U123456.
+*
+* @param string $name
+* @return string
+*/
+   private function coerceName( $name ) {
+   $coercedName = DOMUtils::coerceName( $name );
+   if ( $name !== $coercedName ) {
+   $this->coerced = true;
+   }
+   return $coercedName;
+   }
+
private function createNode( Element $element ) {
-   $node = $this->doc->createElementNS(
-   $element->namespace,
-   $element->name );
+   try {
+   $node = $this->doc->createElementNS(
+   $element->namespace,
+   $element->name );
+   } catch ( \DOMException $e ) {
+   // Attempt to escape the name so that it is more 
acceptable
+   $node = $this->doc->createElementNS(
+   $element->namespace,
+   $this->coerceName( $element->name ) );
+   }
 
foreach ( $element->attrs->getObjects() as $attr ) {
if ( $attr->namespaceURI === null
@@ -98,12 +131,26 @@
// way can't be discovered via hasAttribute() 
or hasAttributeNS().
$attrNode = $this->doc->createAttribute( 
$attr->localName );
$attrNode->value = $attr->value;
-   $node->setAttributeNodeNS( $attrNode );
+   try {
+   $node->setAttributeNodeNS( $attrNode );
+   } catch ( \DOMException $e ) {
+   $node->setAttributeNS(
+   $attr->namespaceURI,
+   $this->coerceName( 
$attr->qualifiedName ),
+   

[MediaWiki-commits] [Gerrit] mediawiki...RemexHtml[master]: Add XML infoset coercion to DOMBuilder

2017-02-01 Thread Tim Starling (Code Review)
Tim Starling has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/335596 )

Change subject: Add XML infoset coercion to DOMBuilder
..

Add XML infoset coercion to DOMBuilder

* In DOMBuilder, encode names that are invalid in XML
* For performance, trigger encoding names only when an exception is caught.
* Invert the encoding in TestFormatter, and optionally in HtmlFormatter
* Re-enable tests that now pass

Change-Id: Iddfa497e07636693baa280575b086f46adefe002
---
M src/DOM/DOMBuilder.php
M src/GenerateDataFiles.php
M src/HTMLData.php
M src/Serializer/HtmlFormatter.php
M src/Serializer/TestFormatter.php
M tests/phpunit/TreeBuilderTest.php
6 files changed, 238 insertions(+), 34 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/libs/RemexHtml 
refs/changes/96/335596/1

diff --git a/src/DOM/DOMBuilder.php b/src/DOM/DOMBuilder.php
index 5519e17..e4de294 100644
--- a/src/DOM/DOMBuilder.php
+++ b/src/DOM/DOMBuilder.php
@@ -10,14 +10,15 @@
  * A TreeHandler which constructs a DOMDocument
  */
 class DOMBuilder implements TreeHandler {
-   private $doc;
-   private $errorCallback;
-   private $isFragment;
-
public $doctypeName;
public $public;
public $system;
public $quirks;
+
+   private $doc;
+   private $errorCallback;
+   private $isFragment;
+   private $coerced;
 
/**
 * @param callable|null $errorCallback A function which is called on 
parse errors
@@ -45,6 +46,16 @@
}
}
 
+   /**
+* Returns true if the document was coerced due to libxml limitations. 
We
+* follow HTML 5.1 § 8.2.7 "Coercing an HTML DOM into an infoset".
+*
+* @return bool
+*/
+   public function isCoerced() {
+   return $this->coerced;
+   }
+
public function startDocument( $fragmentNamespace, $fragmentName ) {
$impl = new \DOMImplementation;
$this->isFragment = $fragmentNamespace !== null;
@@ -53,9 +64,10 @@
 
private function createDocument( $doctypeName = null, $public = null, 
$system = null ) {
$impl = new \DOMImplementation;
-   if ( $doctypeName === null
-   || $doctypeName === '' // libxml limitation, causes 
test failures
-   ) {
+   if ( $doctypeName === '' ) {
+   $this->coerced = true;
+   $doc = $impl->createDocument( null, null );
+   } elseif ( $doctypeName === null ) {
$doc = $impl->createDocument( null, null );
} else {
$doctype = $impl->createDocumentType( $doctypeName, 
$public, $system );
@@ -82,10 +94,31 @@
$parent->insertBefore( $node, $refNode );
}
 
+   /**
+* Replace unsupported characters with a code of the form U123456.
+*
+* @param string $name
+* @return string
+*/
+   private function coerceName( $name ) {
+   $coercedName = DOMUtils::coerceName( $name );
+   if ( $name !== $coercedName ) {
+   $this->coerced = true;
+   }
+   return $coercedName;
+   }
+
private function createNode( Element $element ) {
-   $node = $this->doc->createElementNS(
-   $element->namespace,
-   $element->name );
+   try {
+   $node = $this->doc->createElementNS(
+   $element->namespace,
+   $element->name );
+   } catch ( \DOMException $e ) {
+   // Attempt to escape the name so that it is more 
acceptable
+   $node = $this->doc->createElementNS(
+   $element->namespace,
+   $this->coerceName( $element->name ) );
+   }
 
foreach ( $element->attrs->getObjects() as $attr ) {
if ( $attr->namespaceURI === null
@@ -98,12 +131,26 @@
// way can't be discovered via hasAttribute() 
or hasAttributeNS().
$attrNode = $this->doc->createAttribute( 
$attr->localName );
$attrNode->value = $attr->value;
-   $node->setAttributeNodeNS( $attrNode );
+   try {
+   $node->setAttributeNodeNS( $attrNode );
+   } catch ( \DOMException $e ) {
+   $node->setAttributeNS(
+   $attr->namespaceURI,
+   $this->coerceName( 
$attr->qualifiedName ),
+