Santhosh has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/363156 )

Change subject: MT: Sanitize HTML output from machine translation services
......................................................................

MT: Sanitize HTML output from machine translation services

Uses DOMPurify.

DOMPurify needs a DOM implementation. jsdom is used for this and is declared
as a dependency.

Some MT tests were updated because the attribute order changes in the
sanitized output.
Bug: T169295

Change-Id: I25c533e1ad7fe1b70937edf0fdbcae4b03b570fa
---
M mt/MTClient.js
M package.json
M test/mt/Apertium.test.js
3 files changed, 29 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver 
refs/changes/56/363156/1

diff --git a/mt/MTClient.js b/mt/MTClient.js
index 7afa926..3918f37 100644
--- a/mt/MTClient.js
+++ b/mt/MTClient.js
@@ -2,7 +2,9 @@
 
 var LinearDoc = require( __dirname + '/../lineardoc' ),
        BBPromise = require( 'bluebird' ),
-       SubSequenceMatcher = require( 
'./annotationmapper/SubsequenceMatcher.js' );
+       SubSequenceMatcher = require( 
'./annotationmapper/SubsequenceMatcher.js' ),
+       createDOMPurify = require( 'dompurify' ),
+       jsdom = require( 'jsdom' );
 
 /**
  * MTClient - Generic machine translation client.
@@ -85,7 +87,28 @@
 
        return BBPromise.all( chain ).then( function ( results ) {
                targetDoc.items = results;
-               return targetDoc.getHtml();
+               // Return sanitized HTML output
+               return self.sanitize( targetDoc.getHtml() );
+       } );
+};
+
+/**
+ * Sanitize given HTML using DOMPurify
+ * @param {string} html Dirty HTML
+ * @return {string} sanitized HTML output
+ */
+MTClient.prototype.sanitize = function ( html ) {
+       if ( !this.DOMPurify ) {
+               // Lazy initialize DOMPurify
+               this.DOMPurify = createDOMPurify( ( new jsdom.JSDOM( '' ) 
).window );
+       }
+
+       if ( !this.DOMPurify.isSupported ) {
+               throw new Error( 'DOMPurify not suppported in the DOM 
environment provided by JSDOM' );
+       }
+
+       return this.DOMPurify.sanitize( html, {
+               ADD_URI_SAFE_ATTR: [ 'rel' ] // Without this rel="mw:WikiLink" 
attributes will be removed.
        } );
 };
 
diff --git a/package.json b/package.json
index 9e5a11d..c70b1a0 100644
--- a/package.json
+++ b/package.json
@@ -30,7 +30,9 @@
     "preq": "^0.5.2",
     "service-runner": "^2.2.5",
     "swagger-router": "^0.4.6",
-    "swagger-ui": "git+https://github.com/wikimedia/swagger-ui#master"
+    "swagger-ui": "git+https://github.com/wikimedia/swagger-ui#master",
+    "dompurify": "^0.9.0",
+    "jsdom": "^10.1.0"
   },
   "devDependencies": {
     "async": "^1.4.2",
diff --git a/test/mt/Apertium.test.js b/test/mt/Apertium.test.js
index 722c8bf..90c7091 100644
--- a/test/mt/Apertium.test.js
+++ b/test/mt/Apertium.test.js
@@ -84,7 +84,7 @@
        {
                title: 'Find longest match among multiple matches',
                source: '<p id="8"><span class="cx-segment" 
data-segmentid="9"><a class="cx-link" data-linkid="17" 
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The 
New York Times</a>, which has an <b>executive editor</b> over the news pages 
and an <b>editorial page editor</b> over opinion pages.</span></p>',
-               target: '<p id="8"><span class="cx-segment" 
data-segmentid="9"><a class="cx-link" data-linkid="17" 
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The 
New York Times</a>, el cual tiene un <b>editor ejecutivo</b> sobre las páginas 
noticiosas y un <b>editor de página del editorial</b> encima páginas de 
opinión.</span></p>',
+               target: '<p id="8"><span data-segmentid="9" 
class="cx-segment"><a title="The New York Times" rel="mw:WikiLink" 
href="./The_New_York_Times" data-linkid="17" class="cx-link">The New York 
Times</a>, el cual tiene un <b>editor ejecutivo</b> sobre las páginas 
noticiosas y un <b>editor de página del editorial</b> encima páginas de 
opinión.</span></p>',
                textTranslations: {
                        'The New York Times, which has an executive editor over 
the news pages and an editorial page editor over opinion pages.': 'The New York 
Times, el cual tiene un editor ejecutivo sobre las páginas noticiosas y un 
editor de página del editorial encima páginas de opinión.',
                        'The New York Times': 'The New York Times',

-- 
To view, visit https://gerrit.wikimedia.org/r/363156
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I25c533e1ad7fe1b70937edf0fdbcae4b03b570fa
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to