MaxSem has submitted this change and it was merged.
Change subject: Cherry-pick HtmlFormatter into production
......................................................................
Cherry-pick HtmlFormatter into production
Contains commits:
* Ic276e1604c5718e8568e120ddfb9a8fc13a682fc "Move HtmlFormatter from
MobileFrontend"
* I5c6394f811f9cd14dc549d005b4583d7575e85aa "libxml_disable_entity_loader()
just in case..."
* I69623d565826aacaa884c1c0a3ffec46ca1fb465 "Followup to missing call in
Ic276e1604c5718e8568e120ddfb9a8fc13a682fc"
Change-Id: I4e3c5d9537b1ecffed0f62736784712187476bad
---
M includes/AutoLoader.php
A includes/HtmlFormatter.php
A tests/phpunit/includes/HtmlFormatterTest.php
3 files changed, 418 insertions(+), 0 deletions(-)
Approvals:
MaxSem: Verified; Looks good to me, approved
jenkins-bot: Verified
diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php
index 604add3..48e744e 100644
--- a/includes/AutoLoader.php
+++ b/includes/AutoLoader.php
@@ -111,6 +111,7 @@
'HistoryBlobStub' => 'includes/HistoryBlob.php',
'Hooks' => 'includes/Hooks.php',
'Html' => 'includes/Html.php',
+ 'HtmlFormatter' => 'includes/HtmlFormatter.php',
'HTMLApiField' => 'includes/HTMLForm.php',
'HTMLButtonField' => 'includes/HTMLForm.php',
'HTMLCheckField' => 'includes/HTMLForm.php',
diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php
new file mode 100644
index 0000000..125daaf
--- /dev/null
+++ b/includes/HtmlFormatter.php
@@ -0,0 +1,336 @@
+<?php
+/**
+ * Performs transformations of HTML by wrapping around libxml2 and working
+ * around its countless bugs.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ */
+class HtmlFormatter {
+ /**
+ * @var DOMDocument
+ */
+ private $doc;
+
+ private $html;
+ private $itemsToRemove = array();
+ private $elementsToFlatten = array();
+ protected $removeMedia = false;
+
+	/**
+	 * Stores the markup to work on. Nothing is parsed here; the DOM is only
+	 * built when getDoc() is first called.
+	 *
+	 * @param string $html Text to process
+	 */
+	public function __construct( $html ) {
+		$this->html = $html;
+	}
+
+	/**
+	 * Wraps an HTML fragment in a minimal document skeleton: doctype, an empty
+	 * head, and a body that holds the fragment.
+	 *
+	 * @param string $html Fragment to wrap
+	 * @return string Complete document markup
+	 */
+	public static function wrapHTML( $html ) {
+		$prologue = '<!doctype html><html><head></head><body>';
+		$epilogue = '</body></html>';
+
+		return $prologue . $html . $epilogue;
+	}
+
+	/**
+	 * Extension point for descendant classes. getText() passes the HTML through
+	 * this method after stripping comments and the document wrapper, and before
+	 * flattening tags. The base implementation returns its argument unchanged.
+	 *
+	 * @param string $html HTML to process
+	 * @return string Processed HTML
+	 */
+	protected function onHtmlReady( $html ) {
+		return $html;
+	}
+
+	/**
+	 * Returns the DOM for the stored HTML. The document is parsed on the first
+	 * call and the same DOMDocument instance is returned on later calls.
+	 *
+	 * @return DOMDocument DOM to manipulate
+	 */
+	public function getDoc() {
+		if ( !$this->doc ) {
+			// Hand loadHTML() entity references rather than raw UTF-8 bytes
+			$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
+
+			// Encode a space that directly precedes a tag as a numeric entity.
+			// Replacing ' <' with itself would be a no-op; see
+			// https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
+			$html = str_replace( ' <', '&#32;<', $html );
+
+			// Collect parser errors internally and disable the external entity
+			// loader while parsing. The entity loader state is restored and
+			// internal error handling is switched back off afterwards.
+			libxml_use_internal_errors( true );
+			$loader = libxml_disable_entity_loader();
+			$this->doc = new DOMDocument();
+			$this->doc->strictErrorChecking = false;
+			$this->doc->loadHTML( $html );
+			libxml_disable_entity_loader( $loader );
+			libxml_use_internal_errors( false );
+			$this->doc->encoding = 'UTF-8';
+		}
+		return $this->doc;
+	}
+
+	/**
+	 * Sets whether media elements should be removed from the output. When the
+	 * flag is on, parseItemsToRemove() adds the img, audio and video tags to
+	 * the list of tags to remove.
+	 *
+	 * @param bool $flag
+	 */
+	public function setRemoveMedia( $flag = true ) {
+		$this->removeMedia = $flag;
+	}
+
+	/**
+	 * Registers one or more selectors whose matching content filterContent()
+	 * will remove. A single selector string is treated as a one-item list.
+	 *
+	 * @param Array|string $selectors Selector(s) of stuff to remove
+	 */
+	public function remove( $selectors ) {
+		$additions = (array)$selectors;
+		$this->itemsToRemove = array_merge( $this->itemsToRemove, $additions );
+	}
+
+	/**
+	 * Registers one or more element names to flatten, i.e. to strip the tag
+	 * while keeping its content. getText() joins the names into a regular
+	 * expression, so undelimited regex fragments are accepted as well.
+	 *
+	 * @param Array|string $elements Name(s) of tag(s) to flatten
+	 */
+	public function flatten( $elements ) {
+		$additions = (array)$elements;
+		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, $additions );
+	}
+
+	/**
+	 * Instructs the formatter to flatten all tags. It does so by registering a
+	 * pattern that matches any tag name made of ASCII letters and digits,
+	 * optionally preceded by "?" or "!".
+	 */
+	public function flattenAllTags() {
+		$this->flatten( '[?!]?[a-z0-9]+' );
+	}
+
+	/**
+	 * Removes from the DOM every element matched by the selectors collected by
+	 * parseItemsToRemove(): plain tag names, IDs, CSS classes and tag.class pairs.
+	 *
+	 * parseItemsToRemove() as implemented in this class always returns an array
+	 * with four keys, so the early return below is only reachable when a
+	 * subclass overrides it to return an empty value. In every other case the
+	 * DOM gets built here, which makes getText() serialize the parsed tree.
+	 */
+	public function filterContent() {
+		wfProfileIn( __METHOD__ );
+		$removals = $this->parseItemsToRemove();
+
+		if ( !$removals ) {
+			wfProfileOut( __METHOD__ );
+			return;
+		}
+
+		$doc = $this->getDoc();
+
+		// Remove tags
+
+		// You can't remove DOMNodes from a DOMNodeList as you're iterating
+		// over them in a foreach loop. It will seemingly leave the internal
+		// iterator on the foreach out of whack and results will be quite
+		// strange. Though, making a queue of items to remove seems to work.
+		$domElemsToRemove = array();
+		foreach ( $removals['TAG'] as $tagToRemove ) {
+			$tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
+			foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
+				if ( $tagToRemoveNode ) {
+					$domElemsToRemove[] = $tagToRemoveNode;
+				}
+			}
+		}
+
+		$this->removeElements( $domElemsToRemove );
+
+		// Elements with named IDs
+		$domElemsToRemove = array();
+		foreach ( $removals['ID'] as $itemToRemove ) {
+			$itemToRemoveNode = $doc->getElementById( $itemToRemove );
+			if ( $itemToRemoveNode ) {
+				$domElemsToRemove[] = $itemToRemoveNode;
+			}
+		}
+		$this->removeElements( $domElemsToRemove );
+
+		// CSS Classes
+		$domElemsToRemove = array();
+		$xpath = new DOMXpath( $doc );
+		foreach ( $removals['CLASS'] as $classToRemove ) {
+			// contains() is only a coarse pre-filter: it also matches substrings
+			// such as "notfoo" for "foo", so each candidate is re-checked below.
+			$elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
+
+			/** @var $element DOMElement */
+			foreach ( $elements as $element ) {
+				// Compare whole whitespace-separated class tokens. A \b-based
+				// regex would treat "-" as a word boundary (so ".foo" would also
+				// hit class="foo-bar") and would interpret regex metacharacters
+				// contained in the class name.
+				$classes = preg_split(
+					'/\s+/',
+					$element->getAttribute( 'class' ),
+					-1,
+					PREG_SPLIT_NO_EMPTY
+				);
+				if ( in_array( $classToRemove, $classes, true ) && $element->parentNode ) {
+					$domElemsToRemove[] = $element;
+				}
+			}
+		}
+		$this->removeElements( $domElemsToRemove );
+
+		// Tags with CSS Classes. Note that this matches only elements whose
+		// class attribute is exactly the given class name.
+		foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
+			$parts = explode( '.', $classToRemove );
+
+			$elements = $xpath->query(
+				'//' . $parts[0] . '[@class="' . $parts[1] . '"]'
+			);
+
+			$this->removeElements( $elements );
+		}
+
+		wfProfileOut( __METHOD__ );
+	}
+
+	/**
+	 * Detaches each of the given elements from its parent node. Elements that
+	 * have no parent (for example because an ancestor pass already detached
+	 * them) are skipped.
+	 *
+	 * @param array|DOMNodeList $elements
+	 */
+	private function removeElements( $elements ) {
+		// Copy a DOMNodeList into a plain array first, so that nodes are never
+		// removed from the very list that is being iterated.
+		$queue = $elements instanceof DOMNodeList
+			? iterator_to_array( $elements )
+			: $elements;
+
+		/** @var $node DOMElement */
+		foreach ( $queue as $node ) {
+			$parent = $node->parentNode;
+			if ( $parent ) {
+				$parent->removeChild( $node );
+			}
+		}
+	}
+
+	/**
+	 * libxml converts many characters to entities when serializing; this
+	 * function performs the reverse conversion back to UTF-8 while keeping the
+	 * four markup-significant entities (&quot; &amp; &lt; &gt;) escaped.
+	 *
+	 * @param string $html Markup produced by libxml
+	 * @return string
+	 */
+	private function fixLibXML( $html ) {
+		wfProfileIn( __METHOD__ );
+		static $replacements;
+		if ( !$replacements ) {
+			// Escape the ampersand of the entities that must survive, so that the
+			// HTML-ENTITIES decode below turns e.g. '&amp;lt;' back into '&lt;'
+			// instead of into a raw '<'. Only the named forms are listed because,
+			// per the original note, entities had already been normalized by
+			// libxml. Using this function with input not sanitized by libxml is
+			// UNSAFE!
+			$replacements = new ReplacementArray( array(
+				'&quot;' => '&amp;quot;',
+				'&amp;' => '&amp;amp;',
+				'&lt;' => '&amp;lt;',
+				'&gt;' => '&amp;gt;',
+			) );
+		}
+		$html = $replacements->replace( $html );
+		$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
+		wfProfileOut( __METHOD__ );
+		return $html;
+	}
+
+	/**
+	 * Performs final transformations and returns resulting HTML.
+	 *
+	 * If the DOM was never built, the stored input text is used as is;
+	 * otherwise the document is serialized. In both cases HTML comments and
+	 * everything outside <body> are stripped, onHtmlReady() is applied and the
+	 * tags registered through flatten() are removed.
+	 *
+	 * @param DOMElement|string|null $element Element, or ID of the element, to
+	 *   get HTML from. With null, or when the ID resolves to no element, the
+	 *   whole tree is returned. Only honoured once the DOM has been built.
+	 * @return string Processed HTML
+	 */
+	public function getText( $element = null ) {
+		wfProfileIn( __METHOD__ );
+
+		if ( $this->doc ) {
+			if ( $element !== null && !( $element instanceof DOMElement ) ) {
+				$element = $this->doc->getElementById( $element );
+			}
+			if ( $element ) {
+				// Empty <body> and make the requested element its only child.
+				// The children are queued first because nodes must not be
+				// removed from a node list while it is being iterated.
+				$body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
+				$nodesArray = array();
+				foreach ( $body->childNodes as $node ) {
+					$nodesArray[] = $node;
+				}
+				foreach ( $nodesArray as $nodeArray ) {
+					$body->removeChild( $nodeArray );
+				}
+				$body->appendChild( $element );
+			}
+			$html = $this->doc->saveHTML();
+			$html = $this->fixLibXML( $html );
+		} else {
+			$html = $this->html;
+		}
+		if ( wfIsWindows() ) {
+			// Drop the carriage-return entities only; replacing a plain space
+			// here would strip every space from the output.
+			$html = str_replace( '&#13;', '', $html );
+		}
+		$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
+		$html = $this->onHtmlReady( $html );
+
+		if ( $this->elementsToFlatten ) {
+			$elements = implode( '|', $this->elementsToFlatten );
+			$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
+		}
+
+		wfProfileOut( __METHOD__ );
+		return $html;
+	}
+
+	/**
+	 * Classifies a selector as CLASS (".name"), ID ("#name"), TAG_CLASS
+	 * ("tag.name") or TAG (a bare name without square brackets).
+	 *
+	 * @param string $selector CSS selector to parse
+	 * @param string &$type Receives the selector type
+	 * @param string &$rawName Receives the name: without the leading "." or "#"
+	 *   for CLASS and ID, the unchanged selector for TAG_CLASS and TAG
+	 * @return bool Whether the selector was successfully recognised
+	 * @throws MWException If the selector matches none of the supported forms
+	 */
+	protected function parseSelector( $selector, &$type, &$rawName ) {
+		$dotPos = strpos( $selector, '.' );
+
+		if ( $dotPos === 0 ) {
+			$type = 'CLASS';
+			$rawName = substr( $selector, 1 );
+			return true;
+		}
+
+		if ( strpos( $selector, '#' ) === 0 ) {
+			$type = 'ID';
+			$rawName = substr( $selector, 1 );
+			return true;
+		}
+
+		// A dot somewhere after the first character: tag.class
+		if ( $dotPos !== false ) {
+			$type = 'TAG_CLASS';
+			$rawName = $selector;
+			return true;
+		}
+
+		$hasBracket = strpos( $selector, '[' ) !== false
+			|| strpos( $selector, ']' ) !== false;
+		if ( $hasBracket ) {
+			throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
+		}
+
+		$type = 'TAG';
+		$rawName = $selector;
+		return true;
+	}
+
+	/**
+	 * Sorts the registered selectors into buckets keyed by selector type
+	 * (ID, TAG, CLASS, TAG_CLASS). When media removal is enabled, the img,
+	 * audio and video tags are appended to the TAG bucket.
+	 *
+	 * @return array Map of selector type to list of names
+	 */
+	protected function parseItemsToRemove() {
+		wfProfileIn( __METHOD__ );
+		$buckets = array(
+			'ID' => array(),
+			'TAG' => array(),
+			'CLASS' => array(),
+			'TAG_CLASS' => array(),
+		);
+
+		foreach ( $this->itemsToRemove as $selector ) {
+			$kind = '';
+			$name = '';
+			if ( !$this->parseSelector( $selector, $kind, $name ) ) {
+				continue;
+			}
+			$buckets[$kind][] = $name;
+		}
+
+		if ( $this->removeMedia ) {
+			foreach ( array( 'img', 'audio', 'video' ) as $mediaTag ) {
+				$buckets['TAG'][] = $mediaTag;
+			}
+		}
+
+		wfProfileOut( __METHOD__ );
+		return $buckets;
+	}
+}
diff --git a/tests/phpunit/includes/HtmlFormatterTest.php b/tests/phpunit/includes/HtmlFormatterTest.php
new file mode 100644
index 0000000..a37df74
--- /dev/null
+++ b/tests/phpunit/includes/HtmlFormatterTest.php
@@ -0,0 +1,81 @@
+<?php
+
+/**
+ * @group HtmlFormatter
+ */
+class HtmlFormatterTest extends MediaWikiTestCase {
+	/**
+	 * Wraps the input in a document, applies the optional formatter setup
+	 * callback, filters the content and compares the resulting text with the
+	 * expectation. Line breaks are ignored on both sides.
+	 *
+	 * @dataProvider getHtmlData
+	 */
+	public function testTransform( $input, $expected, $callback = false ) {
+		$document = HtmlFormatter::wrapHTML( self::normalize( $input ) );
+		$formatter = new HtmlFormatter( $document );
+		if ( $callback ) {
+			$callback( $formatter );
+		}
+		$formatter->filterContent();
+
+		$wanted = self::normalize( $expected );
+		$actual = self::normalize( $formatter->getText() );
+		$this->assertEquals( $wanted, $actual );
+	}
+
+	private static function normalize( $s ) {
+		// Strip carriage returns first, then line feeds ("yay" to Windows!)
+		return str_replace( array( "\r", "\n" ), '', $s );
+	}
+
+	/**
+	 * Data provider for testTransform().
+	 *
+	 * @return array List of cases: input HTML, expected HTML and, optionally,
+	 *   a callback that configures the formatter
+	 */
+	public function getHtmlData() {
+		$removeImages = function( HtmlFormatter $f ) {
+			$f->setRemoveMedia();
+		};
+		$removeTags = function( HtmlFormatter $f ) {
+			$f->remove( array( 'table', '.foo', '#bar', 'div.baz' ) );
+		};
+		$flattenSomeStuff = function( HtmlFormatter $f ) {
+			$f->flatten( array( 's', 'div' ) );
+		};
+		$flattenEverything = function( HtmlFormatter $f ) {
+			$f->flattenAllTags();
+		};
+		return array(
+			// remove images if asked
+			array(
+				'<img src="/foo/bar.jpg" alt="Blah"/>',
+				'',
+				$removeImages,
+			),
+			// basic tag removal
+			array(
+				'<table><tr><td>foo</td></tr></table><div class="foo">foo</div><div class="foo quux">foo</div><span id="bar">bar</span>
+<strong class="foo" id="bar">foobar</strong><div class="notfoo">test</div><div class="baz"/>
+<span class="baz">baz</span>',
+
+				'<div class="notfoo">test</div>
+<span class="baz">baz</span>',
+				$removeTags,
+			),
+			// don't flatten tags that start like chosen ones
+			array(
+				'<div><s>foo</s> <span>bar</span></div>',
+				'foo <span>bar</span>',
+				$flattenSomeStuff,
+			),
+			// total flattening
+			array(
+				'<div style="foo">bar<sup>2</sup></div>',
+				'bar2',
+				$flattenEverything,
+			),
+			// UTF-8 preservation and security: escaped markup characters must
+			// stay escaped, and every spelling of an ampersand reference must
+			// come out as &amp;
+			// NOTE(review): the exact numeric spellings of the last four
+			// references in the input are reconstructed - confirm against the
+			// repository copy of this test.
+			array(
+				'<span title="&quot; \' &amp;">&lt;Тест!&gt;</span> &amp;&lt;&#38;&#0038;&#x26;&#x026;',
+				'<span title="&quot; \' &amp;">&lt;Тест!&gt;</span> &amp;&lt;&amp;&amp;&amp;&amp;',
+			),
+			// https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
+			array(
+				'Foo<sup id="cite_ref-1" class="reference"><a href="#cite_note-1">[1]</a></sup> <a href="/wiki/Bar" title="Bar" class="mw-redirect">Bar</a>',
+				'Foo<sup id="cite_ref-1" class="reference"><a href="#cite_note-1">[1]</a></sup> <a href="/wiki/Bar" title="Bar" class="mw-redirect">Bar</a>',
+			),
+		);
+	}
+}
--
To view, visit https://gerrit.wikimedia.org/r/85899
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I4e3c5d9537b1ecffed0f62736784712187476bad
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: wmf/1.22wmf18
Gerrit-Owner: MaxSem <[email protected]>
Gerrit-Reviewer: Daniel Friesen <[email protected]>
Gerrit-Reviewer: MaxSem <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits