jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/335596 )
Change subject: Add XML infoset coercion to DOMBuilder ...................................................................... Add XML infoset coercion to DOMBuilder * In DOMBuilder, encode names that are invalid in XML * For performance, trigger encoding names only when an exception is caught. * Invert the encoding in TestFormatter, and optionally in HtmlFormatter * Re-enable tests that now pass Change-Id: Iddfa497e07636693baa280575b086f46adefe002 --- M src/DOM/DOMBuilder.php A src/DOM/DOMUtils.php M src/GenerateDataFiles.php M src/HTMLData.php M src/Serializer/HtmlFormatter.php M src/Serializer/TestFormatter.php M tests/phpunit/TreeBuilderTest.php 7 files changed, 277 insertions(+), 34 deletions(-) Approvals: Subramanya Sastry: Looks good to me, approved jenkins-bot: Verified diff --git a/src/DOM/DOMBuilder.php b/src/DOM/DOMBuilder.php index 5519e17..e4de294 100644 --- a/src/DOM/DOMBuilder.php +++ b/src/DOM/DOMBuilder.php @@ -10,14 +10,15 @@ * A TreeHandler which constructs a DOMDocument */ class DOMBuilder implements TreeHandler { - private $doc; - private $errorCallback; - private $isFragment; - public $doctypeName; public $public; public $system; public $quirks; + + private $doc; + private $errorCallback; + private $isFragment; + private $coerced; /** * @param callable|null $errorCallback A function which is called on parse errors @@ -45,6 +46,16 @@ } } + /** + * Returns true if the document was coerced due to libxml limitations. We + * follow HTML 5.1 ยง 8.2.7 "Coercing an HTML DOM into an infoset". + * + * @return bool + */ + public function isCoerced() { + return $this->coerced; + } + public function startDocument( $fragmentNamespace, $fragmentName ) { $impl = new \DOMImplementation; $this->isFragment = $fragmentNamespace !== null; @@ -53,9 +64,10 @@ private function createDocument( $doctypeName = null, $public = null, $system = null ) { $impl = new \DOMImplementation; - if ( $doctypeName === null - || $doctypeName === '' // libxml limitation, causes test failures - ) { + if ( $doctypeName === '' ) { + $this->coerced = true; + $doc = $impl->createDocument( null, null ); + } elseif ( $doctypeName === null ) { $doc = $impl->createDocument( null, null ); } else { $doctype = $impl->createDocumentType( $doctypeName, $public, $system ); @@ -82,10 +94,31 @@ $parent->insertBefore( $node, $refNode ); } + /** + * Replace unsupported characters with a code of the form U123456. + * + * @param string $name + * @return string + */ + private function coerceName( $name ) { + $coercedName = DOMUtils::coerceName( $name ); + if ( $name !== $coercedName ) { + $this->coerced = true; + } + return $coercedName; + } + private function createNode( Element $element ) { - $node = $this->doc->createElementNS( - $element->namespace, - $element->name ); + try { + $node = $this->doc->createElementNS( + $element->namespace, + $element->name ); + } catch ( \DOMException $e ) { + // Attempt to escape the name so that it is more acceptable + $node = $this->doc->createElementNS( + $element->namespace, + $this->coerceName( $element->name ) ); + } foreach ( $element->attrs->getObjects() as $attr ) { if ( $attr->namespaceURI === null @@ -98,12 +131,26 @@ // way can't be discovered via hasAttribute() or hasAttributeNS(). $attrNode = $this->doc->createAttribute( $attr->localName ); $attrNode->value = $attr->value; - $node->setAttributeNodeNS( $attrNode ); + try { + $node->setAttributeNodeNS( $attrNode ); + } catch ( \DOMException $e ) { + $node->setAttributeNS( + $attr->namespaceURI, + $this->coerceName( $attr->qualifiedName ), + $attr->value ); + } } else { - $node->setAttributeNS( - $attr->namespaceURI, - $attr->qualifiedName, - $attr->value ); + try { + $node->setAttributeNS( + $attr->namespaceURI, + $attr->qualifiedName, + $attr->value ); + } catch ( \DOMException $e ) { + $node->setAttributeNS( + $attr->namespaceURI, + $this->coerceName( $attr->qualifiedName ), + $attr->value ); + } } } $element->userData = $node; @@ -164,17 +211,41 @@ // instead. $attrNode = $this->doc->createAttribute( $attr->localName ); $attrNode->value = $attr->value; - $replaced = $node->setAttributeNodeNS( $attrNode ); + try { + $replaced = $node->setAttributeNodeNS( $attrNode ); + } catch ( \DOMException $e ) { + $attrNode = $this->doc->createAttribute( + $this->coerceName( $attr->localName ) ); + $attrNode->value = $attr->value; + $replaced = $node->setAttributeNodeNS( $attrNode ); + } if ( $replaced ) { // Put it back how it was $node->setAttributeNodeNS( $replaced ); } } elseif ( $attr->namespaceURI === null ) { - if ( !$node->hasAttribute( $attr->localName ) ) { - $node->setAttribute( $attr->localName, $attr->value ); + try { + if ( !$node->hasAttribute( $attr->localName ) ) { + $node->setAttribute( $attr->localName, $attr->value ); + } + } catch ( \DOMException $e ) { + $name = $this->coerceName( $attr->localName ); + if ( !$node->hasAttribute( $name ) ) { + $node->setAttribute( $name, $attr->value ); + } } - } elseif ( !$node->hasAttributeNS( $attr->namespaceURI, $attr->localName ) ) { - $node->setAttributeNS( $attr->namespaceURI, $attr->localName, $attr->value ); + } else { + try { + if ( !$node->hasAttributeNS( $attr->namespaceURI, $attr->localName ) ) { + $node->setAttributeNS( $attr->namespaceURI, + $attr->localName, $attr->value ); + } + } catch ( \DOMException $e ) { + $name = $this->coerceName( $attr->localName ); + if ( !$node->hasAttributeNS( $attr->namespaceURI, $name ) ) { + $node->setAttributeNS( $attr->namespaceURI, $name, $attr->value ); + } + } } } } diff --git a/src/DOM/DOMUtils.php b/src/DOM/DOMUtils.php new file mode 100644 index 0000000..46dea19 --- /dev/null +++ b/src/DOM/DOMUtils.php @@ -0,0 +1,39 @@ +<?php + +namespace RemexHtml\DOM; +use RemexHtml\HTMLData; + +class DOMUtils { + /** + * Replace unsupported characters with a code of the form U123456. + * + * @param string $name + * @return string + */ + public static function coerceName( $name ) { + $coercedName = + mb_encode_numericentity( mb_substr( $name, 0, 1, 'UTF-8' ), + HTMLData::$nameStartCharConvTable, 'UTF-8', true ) . + mb_encode_numericentity( mb_substr( $name, 1, null, 'UTF-8' ), + HTMLData::$nameCharConvTable, 'UTF-8', true ); + $coercedName = preg_replace_callback( '/&#x([0-9A-F]*);/', + function ( $m ) { + return 'U' . str_pad( $m[1], 6, '0', STR_PAD_LEFT ); + }, + $coercedName ); + return $coercedName; + } + + /** + * Invert the encoding produced by coerceName() + * + * @param string $name + * @return string + */ + public static function uncoerceName( $name ) { + return mb_decode_numericentity( + preg_replace( '/U([0-9A-F]{6})/', '&#x\1;', $name ), + [ 0, 0x10ffff, 0, 0xffffff ], + 'UTF-8' ); + } +} diff --git a/src/GenerateDataFiles.php b/src/GenerateDataFiles.php index d849596..13430ff 100644 --- a/src/GenerateDataFiles.php +++ b/src/GenerateDataFiles.php @@ -134,6 +134,18 @@ self::NS_SVG => 'foreignObject, desc, title', ]; + // @codingStandardsIgnoreStart + /** + * The NameStartChar production from XML 1.0, but with colon excluded since + * there's a lot of ways to break namespace validation, and we actually need + * this for local names + */ + private static $nameStartChar = '[A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]'; + + /** The NameChar production from XML 1.0 */ + private static $nameChar = 'NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]'; + // @codingStandardsIgnoreEnd + private function makeRegexAlternation( $array ) { $regex = ''; foreach ( $array as $value ) { @@ -143,6 +155,79 @@ $regex .= "\n\t\t" . preg_quote( substr( $value, 1 ), '~' ); } return $regex; + } + + private function getCharRanges( $input, $nonterminals = [] ) { + $ranges = []; + + foreach ( preg_split( '/\s*\|\s*/', $input ) as $case ) { + if ( preg_match( '/^"(.)"$/', $case, $m ) ) { + // Single ASCII character + $ranges[] = [ ord( $m[1] ), ord( $m[1] ) ]; + } elseif ( preg_match( '/^\[(.)-(.)\]$/', $case, $m ) ) { + // ASCII range + $ranges[] = [ ord( $m[1] ), ord( $m[2] ) ]; + } elseif ( preg_match( '/^#x([0-9A-F]+)$/', $case, $m ) ) { + // Single encoded character + $codepoint = intval( $m[1], 16 ); + $ranges[] = [ $codepoint, $codepoint ]; + } elseif ( preg_match( '/^\[#x([0-9A-F]+)-#x([0-9A-F]+)\]$/', $case, $m ) ) { + // Encoded range + $ranges[] = [ intval( $m[1], 16 ), intval( $m[2], 16 ) ]; + } elseif ( isset( $nonterminals[$case] ) ) { + $ranges = array_merge( $ranges, $this->getCharRanges( $nonterminals[$case] ) ); + } else { + throw new \Exception( "Invalid XML char case \"$case\"" ); + } + } + usort( $ranges, function ( $a, $b ) { + return $a[0] - $b[0]; + } ); + return $ranges; + } + + private function makeConvTable( $input, $nonterminals = [] ) { + $ranges = $this->getCharRanges( $input, $nonterminals ); + + // Invert the ranges, produce a set complement + $lastEndPlusOne = 0; + $table = []; + for ( $i = 0; $i < count( $ranges ); $i++ ) { + $start = $ranges[$i][0]; + $end = $ranges[$i][1]; + // Merge consecutive ranges + for ( $j = $i + 1; $j < count( $ranges ); $j++ ) { + if ( $ranges[$j][0] === $end + 1 ) { + $end = $ranges[$j][1]; + $i = $j; + } else { + break; + } + } + + $table[] = $lastEndPlusOne; + $table[] = $start - 1; + $table[] = 0; + $table[] = 0xffffff; + + $lastEndPlusOne = $end + 1; + } + + // Last range + $table[] = $lastEndPlusOne; + $table[] = 0x10ffff; + $table[] = 0; + $table[] = 0xffffff; + + return $table; + } + + private function encodeConvTable( $table ) { + return "[\n\t\t" . implode( ",\n\t\t", array_map( + function ( $a ) { + return implode( ', ', $a ); + }, + array_chunk( $table, 4 ) ) ) . ' ]'; } private function execute() { @@ -188,10 +273,16 @@ $this->makeRegexAlternation( self::$quirkyPublicPrefixes ) . '~xAi'; + $nameStartCharConvTable = $this->makeConvTable( self::$nameStartChar ); + $nameCharConvTable = $this->makeConvTable( self::$nameChar, + [ 'NameStartChar' => self::$nameStartChar ] ); + $encEntityRegex = var_export( $entityRegex, true ); $encTranslations = var_export( $entityTranslations, true ); $encLegacy = var_export( $legacyNumericEntities, true ); $encQuirkyRegex = var_export( $quirkyRegex, true ); + $encNameStartCharConvTable = $this->encodeConvTable( $nameStartCharConvTable ); + $encNameCharConvTable = $this->encodeConvTable( $nameCharConvTable ); $special = []; foreach ( self::$special as $ns => $str ) { @@ -223,6 +314,8 @@ static public \$namedEntityTranslations = $encTranslations; static public \$legacyNumericEntities = $encLegacy; static public \$quirkyPrefixRegex = $encQuirkyRegex; + static public \$nameStartCharConvTable = $encNameStartCharConvTable; + static public \$nameCharConvTable = $encNameCharConvTable; } PHP; diff --git a/src/HTMLData.php b/src/HTMLData.php index 4a47d63..3da2890 100644 --- a/src/HTMLData.php +++ b/src/HTMLData.php @@ -4667,4 +4667,41 @@ //W3O//DTD W3 HTML 3\\.0//| //WebTechs//DTD Mozilla HTML 2\\.0//| //WebTechs//DTD Mozilla HTML//~xAi'; + static public $nameStartCharConvTable = [ + 0, 64, 0, 16777215, + 91, 94, 0, 16777215, + 96, 96, 0, 16777215, + 123, 191, 0, 16777215, + 215, 215, 0, 16777215, + 247, 247, 0, 16777215, + 768, 879, 0, 16777215, + 894, 894, 0, 16777215, + 8192, 8203, 0, 16777215, + 8206, 8303, 0, 16777215, + 8592, 11263, 0, 16777215, + 12272, 12288, 0, 16777215, + 55296, 63743, 0, 16777215, + 64976, 65007, 0, 16777215, + 65534, 65535, 0, 16777215, + 983040, 1114111, 0, 16777215 ]; + static public $nameCharConvTable = [ + 0, 44, 0, 16777215, + 47, 47, 0, 16777215, + 58, 64, 0, 16777215, + 91, 94, 0, 16777215, + 96, 96, 0, 16777215, + 123, 182, 0, 16777215, + 184, 191, 0, 16777215, + 215, 215, 0, 16777215, + 247, 247, 0, 16777215, + 894, 894, 0, 16777215, + 8192, 8203, 0, 16777215, + 8206, 8254, 0, 16777215, + 8257, 8303, 0, 16777215, + 8592, 11263, 0, 16777215, + 12272, 12288, 0, 16777215, + 55296, 63743, 0, 16777215, + 64976, 65007, 0, 16777215, + 65534, 65535, 0, 16777215, + 983040, 1114111, 0, 16777215 ]; } \ No newline at end of file diff --git a/src/Serializer/HtmlFormatter.php b/src/Serializer/HtmlFormatter.php index db61857..5445c1e 100644 --- a/src/Serializer/HtmlFormatter.php +++ b/src/Serializer/HtmlFormatter.php @@ -2,6 +2,7 @@ namespace RemexHtml\Serializer; use RemexHtml\HTMLData; +use RemexHtml\DOM\DOMUtils; use RemexHtml\DOM\DOMFormatter; /** @@ -83,6 +84,7 @@ ]; protected $useSourceDoctype; + protected $reverseCoercion; /** * Constructor. @@ -91,16 +93,20 @@ * - scriptingFlag : Set this to false to disable scripting. True by default. * - useSourceDoctype : Emit the doctype used in the source. If this is * false or absent, an HTML doctype will be used. + * - reverseCoercion : When formatting a DOM node, reverse the encoding + * of invalid names. False by default. */ public function __construct( $options = [] ) { $options += [ 'scriptingFlag' => true, 'useSourceDoctype' => false, + 'reverseCoercion' => false, ]; if ( $options['scriptingFlag'] ) { $this->rawTextElements['noscript'] = true; } $this->useSourceDoctype = $options['useSourceDoctype']; + $this->reverseCoercion = $options['reverseCoercion']; } public function startDocument( $fragmentNamespace, $fragmentName ) { @@ -216,6 +222,10 @@ } else { $name = $node->prefix . ':' . $node->localName; } + if ( $this->reverseCoercion ) { + $name = DOMUtils::uncoerceName( $name ); + } + $s = '<' . $name; foreach ( $node->attributes as $attr ) { switch ( $attr->namespaceURI ) { @@ -239,6 +249,9 @@ $attrName = $attr->localName; } } + if ( $this->reverseCoercion ) { + $attrName = DOMUtils::uncoerceName( $attrName ); + } $encValue = strtr( $attr->value, $this->attributeEscapes ); $s .= " $attrName=\"$encValue\""; } diff --git a/src/Serializer/TestFormatter.php b/src/Serializer/TestFormatter.php index 65af2b3..b2580b2 100644 --- a/src/Serializer/TestFormatter.php +++ b/src/Serializer/TestFormatter.php @@ -6,6 +6,7 @@ use RemexHtml\Tokenizer\PlainAttributes; use RemexHtml\HTMLData; use RemexHtml\DOM\DOMFormatter; +use RemexHtml\DOM\DOMUtils; /** * A Formatter which is used to format documents in (almost) the way they @@ -48,6 +49,7 @@ } private function formatElement( $namespace, $name, $attrs, $contents ) { + $name = DOMUtils::uncoerceName( $name ); if ( $namespace === HTMLData::NS_HTML ) { $tagName = $name; } elseif ( $namespace === HTMLData::NS_SVG ) { @@ -61,6 +63,7 @@ $sortedAttrs = $attrs; ksort( $sortedAttrs, SORT_STRING ); foreach ( $sortedAttrs as $attrName => $attr ) { + $localName = DOMUtils::uncoerceName( $attr->localName ); if ( $attr->namespaceURI === null || isset( $attr->reallyNoNamespace ) ) { @@ -68,7 +71,7 @@ } elseif ( isset( self::$attrNamespaces[$attr->namespaceURI] ) ) { $prefix = self::$attrNamespaces[$attr->namespaceURI] . ' '; } - $ret .= " $prefix{$attr->localName}=\"{$attr->value}\"\n"; + $ret .= " $prefix$localName=\"{$attr->value}\"\n"; } if ( $contents !== null && $contents !== '' ) { $contents = preg_replace( '/^/m', ' ', $contents ); diff --git a/tests/phpunit/TreeBuilderTest.php b/tests/phpunit/TreeBuilderTest.php index 63fbf6f..b9aeeeb 100644 --- a/tests/phpunit/TreeBuilderTest.php +++ b/tests/phpunit/TreeBuilderTest.php @@ -21,19 +21,6 @@ ]; private static $domTestBlacklist = [ - // Invalid tag name - 'tree-construction/html5test-com.dat:1', - 'tree-construction/webkit01.dat:179', - - // Invalid attribute name - 'tree-construction/html5test-com.dat:12', - 'tree-construction/html5test-com.dat:39', - 'tree-construction/tests14.dat:45', - 'tree-construction/tests14.dat:55', - 'tree-construction/tests14.dat:67', - 'tree-construction/tests26.dat:263', - 'tree-construction/webkit01.dat:606', - // Invalid doctype 'tree-construction/doctype01.dat:32', 'tree-construction/doctype01.dat:45', -- To view, visit https://gerrit.wikimedia.org/r/335596 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Iddfa497e07636693baa280575b086f46adefe002 Gerrit-PatchSet: 2 Gerrit-Project: mediawiki/libs/RemexHtml Gerrit-Branch: master Gerrit-Owner: Tim Starling <tstarl...@wikimedia.org> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits