jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/335596 )

Change subject: Add XML infoset coercion to DOMBuilder
......................................................................


Add XML infoset coercion to DOMBuilder

* In DOMBuilder, encode names that are invalid in XML
* For performance, trigger encoding names only when an exception is caught.
* Invert the encoding in TestFormatter, and optionally in HtmlFormatter
* Re-enable tests that now pass

Change-Id: Iddfa497e07636693baa280575b086f46adefe002
---
M src/DOM/DOMBuilder.php
A src/DOM/DOMUtils.php
M src/GenerateDataFiles.php
M src/HTMLData.php
M src/Serializer/HtmlFormatter.php
M src/Serializer/TestFormatter.php
M tests/phpunit/TreeBuilderTest.php
7 files changed, 277 insertions(+), 34 deletions(-)

Approvals:
  Subramanya Sastry: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/src/DOM/DOMBuilder.php b/src/DOM/DOMBuilder.php
index 5519e17..e4de294 100644
--- a/src/DOM/DOMBuilder.php
+++ b/src/DOM/DOMBuilder.php
@@ -10,14 +10,15 @@
  * A TreeHandler which constructs a DOMDocument
  */
 class DOMBuilder implements TreeHandler {
-       private $doc;
-       private $errorCallback;
-       private $isFragment;
-
        public $doctypeName;
        public $public;
        public $system;
        public $quirks;
+
+       private $doc;
+       private $errorCallback;
+       private $isFragment;
+       private $coerced;
 
        /**
         * @param callable|null $errorCallback A function which is called on 
parse errors
@@ -45,6 +46,16 @@
                }
        }
 
+       /**
+        * Returns true if the document was coerced due to libxml limitations. 
We
+        * follow HTML 5.1 ยง 8.2.7 "Coercing an HTML DOM into an infoset".
+        *
+        * @return bool
+        */
+       public function isCoerced() {
+               return $this->coerced;
+       }
+
        public function startDocument( $fragmentNamespace, $fragmentName ) {
                $impl = new \DOMImplementation;
                $this->isFragment = $fragmentNamespace !== null;
@@ -53,9 +64,10 @@
 
        private function createDocument( $doctypeName = null, $public = null, 
$system = null ) {
                $impl = new \DOMImplementation;
-               if ( $doctypeName === null
-                       || $doctypeName === '' // libxml limitation, causes 
test failures
-               ) {
+               if ( $doctypeName === '' ) {
+                       $this->coerced = true;
+                       $doc = $impl->createDocument( null, null );
+               } elseif ( $doctypeName === null ) {
                        $doc = $impl->createDocument( null, null );
                } else {
                        $doctype = $impl->createDocumentType( $doctypeName, 
$public, $system );
@@ -82,10 +94,31 @@
                $parent->insertBefore( $node, $refNode );
        }
 
+       /**
+        * Replace unsupported characters with a code of the form U123456.
+        *
+        * @param string $name
+        * @return string
+        */
+       private function coerceName( $name ) {
+               $coercedName = DOMUtils::coerceName( $name );
+               if ( $name !== $coercedName ) {
+                       $this->coerced = true;
+               }
+               return $coercedName;
+       }
+
        private function createNode( Element $element ) {
-               $node = $this->doc->createElementNS(
-                       $element->namespace,
-                       $element->name );
+               try {
+                       $node = $this->doc->createElementNS(
+                               $element->namespace,
+                               $element->name );
+               } catch ( \DOMException $e ) {
+                       // Attempt to escape the name so that it is more 
acceptable
+                       $node = $this->doc->createElementNS(
+                               $element->namespace,
+                               $this->coerceName( $element->name ) );
+               }
 
                foreach ( $element->attrs->getObjects() as $attr ) {
                        if ( $attr->namespaceURI === null
@@ -98,12 +131,26 @@
                                // way can't be discovered via hasAttribute() 
or hasAttributeNS().
                                $attrNode = $this->doc->createAttribute( 
$attr->localName );
                                $attrNode->value = $attr->value;
-                               $node->setAttributeNodeNS( $attrNode );
+                               try {
+                                       $node->setAttributeNodeNS( $attrNode );
+                               } catch ( \DOMException $e ) {
+                                       $node->setAttributeNS(
+                                               $attr->namespaceURI,
+                                               $this->coerceName( 
$attr->qualifiedName ),
+                                               $attr->value );
+                               }
                        } else {
-                               $node->setAttributeNS(
-                                       $attr->namespaceURI,
-                                       $attr->qualifiedName,
-                                       $attr->value );
+                               try {
+                                       $node->setAttributeNS(
+                                               $attr->namespaceURI,
+                                               $attr->qualifiedName,
+                                               $attr->value );
+                               } catch ( \DOMException $e ) {
+                                       $node->setAttributeNS(
+                                               $attr->namespaceURI,
+                                               $this->coerceName( 
$attr->qualifiedName ),
+                                               $attr->value );
+                               }
                        }
                }
                $element->userData = $node;
@@ -164,17 +211,41 @@
                                // instead.
                                $attrNode = $this->doc->createAttribute( 
$attr->localName );
                                $attrNode->value = $attr->value;
-                               $replaced = $node->setAttributeNodeNS( 
$attrNode );
+                               try {
+                                       $replaced = $node->setAttributeNodeNS( 
$attrNode );
+                               } catch ( \DOMException $e ) {
+                                       $attrNode = $this->doc->createAttribute(
+                                               $this->coerceName( 
$attr->localName ) );
+                                       $attrNode->value = $attr->value;
+                                       $replaced = $node->setAttributeNodeNS( 
$attrNode );
+                               }
                                if ( $replaced ) {
                                        // Put it back how it was
                                        $node->setAttributeNodeNS( $replaced );
                                }
                        } elseif ( $attr->namespaceURI === null ) {
-                               if ( !$node->hasAttribute( $attr->localName ) ) 
{
-                                       $node->setAttribute( $attr->localName, 
$attr->value );
+                               try {
+                                       if ( !$node->hasAttribute( 
$attr->localName ) ) {
+                                               $node->setAttribute( 
$attr->localName, $attr->value );
+                                       }
+                               } catch ( \DOMException $e ) {
+                                       $name = $this->coerceName( 
$attr->localName );
+                                       if ( !$node->hasAttribute( $name ) ) {
+                                               $node->setAttribute( $name, 
$attr->value );
+                                       }
                                }
-                       } elseif ( !$node->hasAttributeNS( $attr->namespaceURI, 
$attr->localName ) ) {
-                               $node->setAttributeNS( $attr->namespaceURI, 
$attr->localName, $attr->value );
+                       } else {
+                               try {
+                                       if ( !$node->hasAttributeNS( 
$attr->namespaceURI, $attr->localName ) ) {
+                                               $node->setAttributeNS( 
$attr->namespaceURI,
+                                                       $attr->localName, 
$attr->value );
+                                       }
+                               } catch ( \DOMException $e ) {
+                                       $name = $this->coerceName( 
$attr->localName );
+                                       if ( !$node->hasAttributeNS( 
$attr->namespaceURI, $name ) ) {
+                                               $node->setAttributeNS( 
$attr->namespaceURI, $name, $attr->value );
+                                       }
+                               }
                        }
                }
        }
diff --git a/src/DOM/DOMUtils.php b/src/DOM/DOMUtils.php
new file mode 100644
index 0000000..46dea19
--- /dev/null
+++ b/src/DOM/DOMUtils.php
@@ -0,0 +1,39 @@
+<?php
+
+namespace RemexHtml\DOM;
+use RemexHtml\HTMLData;
+
+class DOMUtils {
+       /**
+        * Replace unsupported characters with a code of the form U123456.
+        *
+        * @param string $name
+        * @return string
+        */
+       public static function coerceName( $name ) {
+               $coercedName =
+                       mb_encode_numericentity( mb_substr( $name, 0, 1, 
'UTF-8' ),
+                               HTMLData::$nameStartCharConvTable, 'UTF-8', 
true ) .
+                       mb_encode_numericentity( mb_substr( $name, 1, null, 
'UTF-8' ),
+                               HTMLData::$nameCharConvTable, 'UTF-8', true );
+               $coercedName = preg_replace_callback( '/&#x([0-9A-F]*);/',
+                       function ( $m ) {
+                               return 'U' . str_pad( $m[1], 6, '0', 
STR_PAD_LEFT );
+                       },
+                       $coercedName );
+               return $coercedName;
+       }
+
+       /**
+        * Invert the encoding produced by coerceName()
+        *
+        * @param string $name
+        * @return string
+        */
+       public static function uncoerceName( $name ) {
+               return mb_decode_numericentity(
+                       preg_replace( '/U([0-9A-F]{6})/', '&#x\1;', $name ),
+                       [ 0, 0x10ffff, 0, 0xffffff ],
+                       'UTF-8' );
+       }
+}
diff --git a/src/GenerateDataFiles.php b/src/GenerateDataFiles.php
index d849596..13430ff 100644
--- a/src/GenerateDataFiles.php
+++ b/src/GenerateDataFiles.php
@@ -134,6 +134,18 @@
                self::NS_SVG => 'foreignObject, desc, title',
        ];
 
+       // @codingStandardsIgnoreStart
+       /**
+        * The NameStartChar production from XML 1.0, but with colon excluded 
since
+        * there's a lot of ways to break namespace validation, and we actually 
need
+        * this for local names
+        */
+       private static $nameStartChar = '[A-Z] | "_" | [a-z] | [#xC0-#xD6] | 
[#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | 
[#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | 
[#xFDF0-#xFFFD] | [#x10000-#xEFFFF]';
+
+       /** The NameChar production from XML 1.0 */
+       private static $nameChar = 'NameStartChar | "-" | "." | [0-9] | #xB7 | 
[#x0300-#x036F] | [#x203F-#x2040]';
+       // @codingStandardsIgnoreEnd
+
        private function makeRegexAlternation( $array ) {
                $regex = '';
                foreach ( $array as $value ) {
@@ -143,6 +155,79 @@
                        $regex .= "\n\t\t" . preg_quote( substr( $value, 1 ), 
'~' );
                }
                return $regex;
+       }
+
+       private function getCharRanges( $input, $nonterminals = [] ) {
+               $ranges = [];
+
+               foreach ( preg_split( '/\s*\|\s*/', $input ) as $case ) {
+                       if ( preg_match( '/^"(.)"$/', $case, $m ) ) {
+                               // Single ASCII character
+                               $ranges[] = [ ord( $m[1] ), ord( $m[1] ) ];
+                       } elseif ( preg_match( '/^\[(.)-(.)\]$/', $case, $m ) ) 
{
+                               // ASCII range
+                               $ranges[] = [ ord( $m[1] ), ord( $m[2] ) ];
+                       } elseif ( preg_match( '/^#x([0-9A-F]+)$/', $case, $m ) 
) {
+                               // Single encoded character
+                               $codepoint = intval( $m[1], 16 );
+                               $ranges[] = [ $codepoint, $codepoint ];
+                       } elseif ( preg_match( 
'/^\[#x([0-9A-F]+)-#x([0-9A-F]+)\]$/', $case, $m ) ) {
+                               // Encoded range
+                               $ranges[] = [ intval( $m[1], 16 ), intval( 
$m[2], 16 ) ];
+                       } elseif ( isset( $nonterminals[$case] ) ) {
+                               $ranges = array_merge( $ranges, 
$this->getCharRanges( $nonterminals[$case] ) );
+                       } else {
+                               throw new \Exception( "Invalid XML char case 
\"$case\"" );
+                       }
+               }
+               usort( $ranges, function ( $a, $b ) {
+                       return $a[0] - $b[0];
+               } );
+               return $ranges;
+       }
+
+       private function makeConvTable( $input, $nonterminals = [] ) {
+               $ranges = $this->getCharRanges( $input, $nonterminals );
+
+               // Invert the ranges, produce a set complement
+               $lastEndPlusOne = 0;
+               $table = [];
+               for ( $i = 0; $i < count( $ranges ); $i++ ) {
+                       $start = $ranges[$i][0];
+                       $end = $ranges[$i][1];
+                       // Merge consecutive ranges
+                       for ( $j = $i + 1; $j < count( $ranges ); $j++ ) {
+                               if ( $ranges[$j][0] === $end + 1 ) {
+                                       $end = $ranges[$j][1];
+                                       $i = $j;
+                               } else {
+                                       break;
+                               }
+                       }
+
+                       $table[] = $lastEndPlusOne;
+                       $table[] = $start - 1;
+                       $table[] = 0;
+                       $table[] = 0xffffff;
+
+                       $lastEndPlusOne = $end + 1;
+               }
+
+               // Last range
+               $table[] = $lastEndPlusOne;
+               $table[] = 0x10ffff;
+               $table[] = 0;
+               $table[] = 0xffffff;
+
+               return $table;
+       }
+
+       private function encodeConvTable( $table ) {
+               return "[\n\t\t" . implode( ",\n\t\t", array_map(
+                       function ( $a ) {
+                               return implode( ', ', $a );
+                       },
+                       array_chunk( $table, 4 ) ) ) . ' ]';
        }
 
        private function execute() {
@@ -188,10 +273,16 @@
                        $this->makeRegexAlternation( 
self::$quirkyPublicPrefixes ) .
                        '~xAi';
 
+               $nameStartCharConvTable = $this->makeConvTable( 
self::$nameStartChar );
+               $nameCharConvTable = $this->makeConvTable( self::$nameChar,
+                       [ 'NameStartChar' => self::$nameStartChar ] );
+
                $encEntityRegex = var_export( $entityRegex, true );
                $encTranslations = var_export( $entityTranslations, true );
                $encLegacy = var_export( $legacyNumericEntities, true );
                $encQuirkyRegex = var_export( $quirkyRegex, true );
+               $encNameStartCharConvTable = $this->encodeConvTable( 
$nameStartCharConvTable );
+               $encNameCharConvTable = $this->encodeConvTable( 
$nameCharConvTable );
 
                $special = [];
                foreach ( self::$special as $ns => $str ) {
@@ -223,6 +314,8 @@
        static public \$namedEntityTranslations = $encTranslations;
        static public \$legacyNumericEntities = $encLegacy;
        static public \$quirkyPrefixRegex = $encQuirkyRegex;
+       static public \$nameStartCharConvTable = $encNameStartCharConvTable;
+       static public \$nameCharConvTable = $encNameCharConvTable;
 }
 PHP;
 
diff --git a/src/HTMLData.php b/src/HTMLData.php
index 4a47d63..3da2890 100644
--- a/src/HTMLData.php
+++ b/src/HTMLData.php
@@ -4667,4 +4667,41 @@
                //W3O//DTD W3 HTML 3\\.0//|
                //WebTechs//DTD Mozilla HTML 2\\.0//|
                //WebTechs//DTD Mozilla HTML//~xAi';
+       static public $nameStartCharConvTable = [
+               0, 64, 0, 16777215,
+               91, 94, 0, 16777215,
+               96, 96, 0, 16777215,
+               123, 191, 0, 16777215,
+               215, 215, 0, 16777215,
+               247, 247, 0, 16777215,
+               768, 879, 0, 16777215,
+               894, 894, 0, 16777215,
+               8192, 8203, 0, 16777215,
+               8206, 8303, 0, 16777215,
+               8592, 11263, 0, 16777215,
+               12272, 12288, 0, 16777215,
+               55296, 63743, 0, 16777215,
+               64976, 65007, 0, 16777215,
+               65534, 65535, 0, 16777215,
+               983040, 1114111, 0, 16777215 ];
+       static public $nameCharConvTable = [
+               0, 44, 0, 16777215,
+               47, 47, 0, 16777215,
+               58, 64, 0, 16777215,
+               91, 94, 0, 16777215,
+               96, 96, 0, 16777215,
+               123, 182, 0, 16777215,
+               184, 191, 0, 16777215,
+               215, 215, 0, 16777215,
+               247, 247, 0, 16777215,
+               894, 894, 0, 16777215,
+               8192, 8203, 0, 16777215,
+               8206, 8254, 0, 16777215,
+               8257, 8303, 0, 16777215,
+               8592, 11263, 0, 16777215,
+               12272, 12288, 0, 16777215,
+               55296, 63743, 0, 16777215,
+               64976, 65007, 0, 16777215,
+               65534, 65535, 0, 16777215,
+               983040, 1114111, 0, 16777215 ];
 }
\ No newline at end of file
diff --git a/src/Serializer/HtmlFormatter.php b/src/Serializer/HtmlFormatter.php
index db61857..5445c1e 100644
--- a/src/Serializer/HtmlFormatter.php
+++ b/src/Serializer/HtmlFormatter.php
@@ -2,6 +2,7 @@
 
 namespace RemexHtml\Serializer;
 use RemexHtml\HTMLData;
+use RemexHtml\DOM\DOMUtils;
 use RemexHtml\DOM\DOMFormatter;
 
 /**
@@ -83,6 +84,7 @@
        ];
 
        protected $useSourceDoctype;
+       protected $reverseCoercion;
 
        /**
         * Constructor.
@@ -91,16 +93,20 @@
         *   - scriptingFlag : Set this to false to disable scripting. True by 
default.
         *   - useSourceDoctype : Emit the doctype used in the source. If this 
is
         *     false or absent, an HTML doctype will be used.
+        *   - reverseCoercion : When formatting a DOM node, reverse the 
encoding
+        *     of invalid names. False by default.
         */
        public function __construct( $options = [] ) {
                $options += [
                        'scriptingFlag' => true,
                        'useSourceDoctype' => false,
+                       'reverseCoercion' => false,
                ];
                if ( $options['scriptingFlag'] ) {
                        $this->rawTextElements['noscript'] = true;
                }
                $this->useSourceDoctype = $options['useSourceDoctype'];
+               $this->reverseCoercion = $options['reverseCoercion'];
        }
 
        public function startDocument( $fragmentNamespace, $fragmentName ) {
@@ -216,6 +222,10 @@
                } else {
                        $name = $node->prefix . ':' . $node->localName;
                }
+               if ( $this->reverseCoercion ) {
+                       $name = DOMUtils::uncoerceName( $name );
+               }
+
                $s = '<' . $name;
                foreach ( $node->attributes as $attr ) {
                        switch ( $attr->namespaceURI ) {
@@ -239,6 +249,9 @@
                                        $attrName = $attr->localName;
                                }
                        }
+                       if ( $this->reverseCoercion ) {
+                               $attrName = DOMUtils::uncoerceName( $attrName );
+                       }
                        $encValue = strtr( $attr->value, 
$this->attributeEscapes );
                        $s .= " $attrName=\"$encValue\"";
                }
diff --git a/src/Serializer/TestFormatter.php b/src/Serializer/TestFormatter.php
index 65af2b3..b2580b2 100644
--- a/src/Serializer/TestFormatter.php
+++ b/src/Serializer/TestFormatter.php
@@ -6,6 +6,7 @@
 use RemexHtml\Tokenizer\PlainAttributes;
 use RemexHtml\HTMLData;
 use RemexHtml\DOM\DOMFormatter;
+use RemexHtml\DOM\DOMUtils;
 
 /**
  * A Formatter which is used to format documents in (almost) the way they
@@ -48,6 +49,7 @@
        }
 
        private function formatElement( $namespace, $name, $attrs, $contents ) {
+               $name = DOMUtils::uncoerceName( $name );
                if ( $namespace === HTMLData::NS_HTML ) {
                        $tagName = $name;
                } elseif ( $namespace === HTMLData::NS_SVG ) {
@@ -61,6 +63,7 @@
                $sortedAttrs = $attrs;
                ksort( $sortedAttrs, SORT_STRING );
                foreach ( $sortedAttrs as $attrName => $attr ) {
+                       $localName = DOMUtils::uncoerceName( $attr->localName );
                        if ( $attr->namespaceURI === null
                                || isset( $attr->reallyNoNamespace )
                        ) {
@@ -68,7 +71,7 @@
                        } elseif ( isset( 
self::$attrNamespaces[$attr->namespaceURI] ) ) {
                                $prefix = 
self::$attrNamespaces[$attr->namespaceURI] . ' ';
                        }
-                       $ret .= "  
$prefix{$attr->localName}=\"{$attr->value}\"\n";
+                       $ret .= "  $prefix$localName=\"{$attr->value}\"\n";
                }
                if ( $contents !== null && $contents !== '' ) {
                        $contents = preg_replace( '/^/m', '  ', $contents );
diff --git a/tests/phpunit/TreeBuilderTest.php 
b/tests/phpunit/TreeBuilderTest.php
index 63fbf6f..b9aeeeb 100644
--- a/tests/phpunit/TreeBuilderTest.php
+++ b/tests/phpunit/TreeBuilderTest.php
@@ -21,19 +21,6 @@
        ];
 
        private static $domTestBlacklist = [
-               // Invalid tag name
-               'tree-construction/html5test-com.dat:1',
-               'tree-construction/webkit01.dat:179',
-
-               // Invalid attribute name
-               'tree-construction/html5test-com.dat:12',
-               'tree-construction/html5test-com.dat:39',
-               'tree-construction/tests14.dat:45',
-               'tree-construction/tests14.dat:55',
-               'tree-construction/tests14.dat:67',
-               'tree-construction/tests26.dat:263',
-               'tree-construction/webkit01.dat:606',
-
                // Invalid doctype
                'tree-construction/doctype01.dat:32',
                'tree-construction/doctype01.dat:45',

-- 
To view, visit https://gerrit.wikimedia.org/r/335596
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Iddfa497e07636693baa280575b086f46adefe002
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/libs/RemexHtml
Gerrit-Branch: master
Gerrit-Owner: Tim Starling <tstarl...@wikimedia.org>
Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to