Santhosh has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/363156 )
Change subject: MT: Sanitize HTML output from machine translation services
......................................................................
MT: Sanitize HTML output from machine translation services
Uses DOMPurify.
DOMPurify needs a DOM implementation. jsdom is used and defined as
a dependency.
Some MT tests are updated since the attribute order changed in the
sanitized output.
Bug: T169295
Change-Id: I25c533e1ad7fe1b70937edf0fdbcae4b03b570fa
---
M mt/MTClient.js
M package.json
M test/mt/Apertium.test.js
3 files changed, 29 insertions(+), 4 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver
refs/changes/56/363156/1
diff --git a/mt/MTClient.js b/mt/MTClient.js
index 7afa926..3918f37 100644
--- a/mt/MTClient.js
+++ b/mt/MTClient.js
@@ -2,7 +2,9 @@
var LinearDoc = require( __dirname + '/../lineardoc' ),
BBPromise = require( 'bluebird' ),
- SubSequenceMatcher = require(
'./annotationmapper/SubsequenceMatcher.js' );
+ SubSequenceMatcher = require(
'./annotationmapper/SubsequenceMatcher.js' ),
+ createDOMPurify = require( 'dompurify' ),
+ jsdom = require( 'jsdom' );
/**
* MTClient - Generic machine translation client.
@@ -85,7 +87,28 @@
return BBPromise.all( chain ).then( function ( results ) {
targetDoc.items = results;
- return targetDoc.getHtml();
+ // Return sanitized HTML output
+ return self.sanitize( targetDoc.getHtml() );
+ } );
+};
+
+/**
+ * Sanitize given HTML using DOMPurify
+ * @param {string} html Dirty HTML
+ * @return {string} sanitized HTML output
+ */
+MTClient.prototype.sanitize = function ( html ) {
+ if ( !this.DOMPurify ) {
+ // Lazy initialize DOMPurify
+ this.DOMPurify = createDOMPurify( ( new jsdom.JSDOM( '' )
).window );
+ }
+
+ if ( !this.DOMPurify.isSupported ) {
+ throw new Error( 'DOMPurify not suppported in the DOM
environment provided by JSDOM' );
+ }
+
+ return this.DOMPurify.sanitize( html, {
+ ADD_URI_SAFE_ATTR: [ 'rel' ] // Without this rel="mw:WikiLink"
attributes will be removed.
} );
};
diff --git a/package.json b/package.json
index 9e5a11d..c70b1a0 100644
--- a/package.json
+++ b/package.json
@@ -30,7 +30,9 @@
"preq": "^0.5.2",
"service-runner": "^2.2.5",
"swagger-router": "^0.4.6",
- "swagger-ui": "git+https://github.com/wikimedia/swagger-ui#master"
+ "swagger-ui": "git+https://github.com/wikimedia/swagger-ui#master",
+ "dompurify": "^0.9.0",
+ "jsdom": "^10.1.0"
},
"devDependencies": {
"async": "^1.4.2",
diff --git a/test/mt/Apertium.test.js b/test/mt/Apertium.test.js
index 722c8bf..90c7091 100644
--- a/test/mt/Apertium.test.js
+++ b/test/mt/Apertium.test.js
@@ -84,7 +84,7 @@
{
title: 'Find longest match among multiple matches',
source: '<p id="8"><span class="cx-segment"
data-segmentid="9"><a class="cx-link" data-linkid="17"
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The
New York Times</a>, which has an <b>executive editor</b> over the news pages
and an <b>editorial page editor</b> over opinion pages.</span></p>',
- target: '<p id="8"><span class="cx-segment"
data-segmentid="9"><a class="cx-link" data-linkid="17"
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The
New York Times</a>, el cual tiene un <b>editor ejecutivo</b> sobre las páginas
noticiosas y un <b>editor de página del editorial</b> encima páginas de
opinión.</span></p>',
+ target: '<p id="8"><span data-segmentid="9"
class="cx-segment"><a title="The New York Times" rel="mw:WikiLink"
href="./The_New_York_Times" data-linkid="17" class="cx-link">The New York
Times</a>, el cual tiene un <b>editor ejecutivo</b> sobre las páginas
noticiosas y un <b>editor de página del editorial</b> encima páginas de
opinión.</span></p>',
textTranslations: {
'The New York Times, which has an executive editor over
the news pages and an editorial page editor over opinion pages.': 'The New York
Times, el cual tiene un editor ejecutivo sobre las páginas noticiosas y un
editor de página del editorial encima páginas de opinión.',
'The New York Times': 'The New York Times',
--
To view, visit https://gerrit.wikimedia.org/r/363156
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I25c533e1ad7fe1b70937edf0fdbcae4b03b570fa
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits