jenkins-bot has submitted this change and it was merged.

Change subject: Stop stripping HTML content and use library for decoding 
entities
......................................................................


Stop stripping HTML content and use library for decoding entities

Our HTML-stripping functions are unsafe and unnecessary and should be
removed.

Also, we should use a popular and actively maintained library for decoding
HTML entities rather than using a hand-rolled function.

Change-Id: Iaa737b0ac40f40d14c92fe44bcec7f96e170f99a
---
M lib/mobile-util.js
M lib/parseDefinition.js
M package.json
M test/features/definition/definition.js
M test/lib/mobile-util/mobile-util-test.js
5 files changed, 7 insertions(+), 53 deletions(-)

Approvals:
  BearND: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/mobile-util.js b/lib/mobile-util.js
index c6ef0e5..551df36 100644
--- a/lib/mobile-util.js
+++ b/lib/mobile-util.js
@@ -167,13 +167,6 @@
     return dateString + '/' + uuid.now().toString();
 };
 
-/**
- * Strip HTML markup from a string.
- */
-mUtil.stripMarkup = function(text) {
-    return text.replace(/<[^>]*>/g, '');
-};
-
 mUtil.throw404 = function(message) {
     throw new HTTPError({
         status: 404,
diff --git a/lib/parseDefinition.js b/lib/parseDefinition.js
index b76fa1a..8728a32 100644
--- a/lib/parseDefinition.js
+++ b/lib/parseDefinition.js
@@ -8,6 +8,7 @@
 'use strict';
 
 var domino = require('domino');
+var he = require('he');
 var sUtil = require('./util');
 var mUtil = require('./mobile-util');
 var transforms = require('./transforms');
@@ -64,21 +65,6 @@
                   */
                 };
 
-function stripSpanTags(text) {
-    return text.replace(/<\/?span[^>]*>/g, '');
-}
-
-function stripItalicTags(text) {
-    return text.replace(/<\/?i[^>]*>/g, '');
-}
-
-function decodeHTML(text) {
-    return text.replace(/&lt;/g,'<')
-               .replace(/&gt;/g,'>')
-               .replace(/&amp;/g,'&')
-               .replace(/&nbsp;/g, ' ');
-}
-
 function hasUsageExamples(text, langCode) {
     return langCode === 'en' ? text.indexOf('<dl') > -1 : text.indexOf('<ul') 
> -1;
 }
@@ -119,18 +105,11 @@
         currentDefinition.examples = [];
         examples = wikiLangCode === 'en' ? element.querySelectorAll('dd') : 
element.querySelectorAll('li');
         for (j = 0; j < examples.length; j++) {
-            example = examples[j].innerHTML.trim();
-            example = decodeHTML(example);
-            example = stripSpanTags(example);
-            example = stripItalicTags(example);
+            example = he.decode(examples[j].innerHTML.trim());
             currentDefinition.examples.push(example);
         }
     } else {
-        defn = element.innerHTML.trim();
-        defn = decodeHTML(defn);
-        defn = stripSpanTags(defn);
-        defn = stripItalicTags(defn);
-        currentDefinition.definition = defn;
+        currentDefinition.definition = he.decode(element.innerHTML.trim());
     }
     return currentDefinition;
 }
@@ -160,7 +139,7 @@
 
     for (j = 0; j < sectionDivs.length; j++) {
         currentSectionDiv = sectionDivs[j];
-        header = mUtil.stripMarkup(currentSectionDiv.title);
+        header = currentSectionDiv.title;
 
         /* Get the language from the first H2 header, and begin iterating over 
sections.
            Per the English Wiktionary style guide (linked in header above), H2 
headings
diff --git a/package.json b/package.json
index 4a5dad2..0490526 100644
--- a/package.json
+++ b/package.json
@@ -46,6 +46,7 @@
     "core-js": "^2.4.1",
     "domino": "^1.0.25",
     "express": "^4.14.0",
+    "he": "^1.1.0",
     "js-yaml": "^3.6.1",
     "locutus": "^2.0.5",
     "mediawiki-title": "^0.5.4",
diff --git a/test/features/definition/definition.js 
b/test/features/definition/definition.js
index 40c583b..af95843 100644
--- a/test/features/definition/definition.js
+++ b/test/features/definition/definition.js
@@ -36,7 +36,7 @@
                     }
                 }
                 assert.deepEqual(en[0].partOfSpeech, 'Noun');
-                assert.ok(en[0].definitions[0].definition.indexOf('An animal 
of the family <a href=\"/wiki/Felidae\">') === 0, 'Expected different start of 
definition');
+                assert.ok(en[0].definitions[0].definition.indexOf('An animal 
of the family <i><a href="/wiki/Felidae">Felidae</a></i>') === 0, 'Expected 
different start of definition');
                 assert.deepEqual(en[1].partOfSpeech, 'Verb');
                 assert.ok(en[1].definitions[0].definition.indexOf('To <a 
href=\"/wiki/hoist\">hoist</a>') === 0, 'Expected different start of 
definition');
             });
diff --git a/test/lib/mobile-util/mobile-util-test.js 
b/test/lib/mobile-util/mobile-util-test.js
index 9027296..93bb986 100644
--- a/test/lib/mobile-util/mobile-util-test.js
+++ b/test/lib/mobile-util/mobile-util-test.js
@@ -19,18 +19,6 @@
 var arr6 = [ obj1, obj3 ];
 var arr7 = [ obj4, obj2 ];
 
-// If a news story string breaks the regex in mUtil.stripMarkup, fix the regex,
-// then add the offending string here for use in future unit testing!
-var newsHtml1 = '<!--June 20 --> In <a rel="mw:WikiLink" href="./Basketball" 
title="Basketball" id="mwBA">basketball</a>, the <a rel="mw:WikiLink" 
href="./Cleveland_Cavaliers" title="Cleveland Cavaliers" id="mwBQ">Cleveland 
Cavaliers</a> defeat the <a rel="mw:WikiLink" href="./Golden_State_Warriors" 
title="Golden State Warriors" id="mwBg">Golden State Warriors</a> to win the <b 
id="mwBw"><a rel="mw:WikiLink" href="./2016_NBA_Finals" title="2016 NBA Finals" 
id="mwCA">NBA Finals</a></b> <i id="mwCQ">(<a rel="mw:WikiLink" 
href="./Bill_Russell_NBA_Finals_Most_Valuable_Player_Award" title="Bill Russell 
NBA Finals Most Valuable Player Award" id="mwCg">MVP</a> <a rel="mw:WikiLink" 
href="./LeBron_James" title="LeBron James" id="mwCw">LeBron James</a> 
pictured)</i>.';
-var newsHtml2 = '<!--June 18--> The <a rel="mw:WikiLink" 
href="./United_Nations" title="United Nations" id="mwDQ">United Nations</a> 
reports 80,000 civilians have fled <a rel="mw:WikiLink" href="./Fallujah" 
title="Fallujah" id="mwDg">Fallujah</a>, as <a rel="mw:WikiLink" 
href="./Iraqi_Armed_Forces" title="Iraqi Armed Forces" id="mwDw">Iraqi Army</a> 
and <a rel="mw:WikiLink" href="./Popular_Mobilization_Forces_(Iraq)" 
title="Popular Mobilization Forces (Iraq)" id="mwEA">Shia militias</a> <b 
id="mwEQ"><a rel="mw:WikiLink" href="./Battle_of_Fallujah_(2016)" title="Battle 
of Fallujah (2016)" id="mwEg">retake most of the city</a></b> from <a 
rel="mw:WikiLink" href="./Islamic_State_of_Iraq_and_the_Levant" title="Islamic 
State of Iraq and the Levant" id="mwEw">ISIL</a>.';
-var newsHtml3 = '<!--June 16--> British <a rel="mw:WikiLink" 
href="./Member_of_parliament" title="Member of parliament" id="mwFQ">Member of 
Parliament</a> <b id="mwFg"><a rel="mw:WikiLink" href="./Jo_Cox" title="Jo Cox" 
id="mwFw">Jo Cox</a></b> dies after being <a rel="mw:WikiLink" 
href="./Killing_of_Jo_Cox" title="Killing of Jo Cox" id="mwGA">shot and 
stabbed</a> in <a rel="mw:WikiLink" href="./Birstall,_West_Yorkshire" 
title="Birstall, West Yorkshire" id="mwGQ">Birstall, West Yorkshire</a>.';
-var newsHtml4 = '<!--June 14--> In <a rel="mw:WikiLink" 
href="./Association_football" title="Association football" 
id="mwGw">association football</a>, <b id="mwHA"><a rel="mw:WikiLink" 
href="./Violence_at_UEFA_Euro_2016" title="Violence at UEFA Euro 2016" 
id="mwHQ">fan violence</a></b> at <a rel="mw:WikiLink" href="./UEFA_Euro_2016" 
title="UEFA Euro 2016" id="mwHg">UEFA Euro 2016</a> results in arrests and 
deportation of fans, and fines for the <a rel="mw:WikiLink" 
href="./Croatian_national_football_team" title="Croatian national football 
team" id="mwHw">Croatian</a>, <a rel="mw:WikiLink" 
href="./Hungarian_national_football_team" title="Hungarian national football 
team" id="mwIA">Hungarian</a> and <a rel="mw:WikiLink" 
href="./Russia_national_football_team" title="Russia national football team" 
id="mwIQ">Russian</a> teams.';
-
-var newsText1 = 'In basketball, the Cleveland Cavaliers defeat the Golden 
State Warriors to win the NBA Finals (MVP LeBron James pictured).';
-var newsText2 = 'The United Nations reports 80,000 civilians have fled 
Fallujah, as Iraqi Army and Shia militias retake most of the city from ISIL.';
-var newsText3 = 'British Member of Parliament Jo Cox dies after being shot and 
stabbed in Birstall, West Yorkshire.';
-var newsText4 = 'In association football, fan violence at UEFA Euro 2016 
results in arrests and deportation of fans, and fines for the Croatian, 
Hungarian and Russian teams.';
-
 describe('lib:mobile-util', function() {
     this.timeout(20000);
 
@@ -54,12 +42,5 @@
     it('fillInMemberKeys should make the specified adjustments', function() {
         mUtil.fillInMemberKeys(arr3, [['world', 'goodbye'], ['again', 'sea']]);
         assert.deepEqual(arr3, arr4);
-    })
-
-    it('HTML stripping regex should strip HTML as expected', function() {
-        assert.deepEqual(mUtil.stripMarkup(newsHtml1).trim(), newsText1);
-        assert.deepEqual(mUtil.stripMarkup(newsHtml2).trim(), newsText2);
-        assert.deepEqual(mUtil.stripMarkup(newsHtml3).trim(), newsText3);
-        assert.deepEqual(mUtil.stripMarkup(newsHtml4).trim(), newsText4);
-    })
+    });
 });

-- 
To view, visit https://gerrit.wikimedia.org/r/302288
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Iaa737b0ac40f40d14c92fe44bcec7f96e170f99a
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: Mholloway <[email protected]>
Gerrit-Reviewer: BearND <[email protected]>
Gerrit-Reviewer: Dbrant <[email protected]>
Gerrit-Reviewer: Fjalapeno <[email protected]>
Gerrit-Reviewer: GWicke <[email protected]>
Gerrit-Reviewer: Jhernandez <[email protected]>
Gerrit-Reviewer: Mholloway <[email protected]>
Gerrit-Reviewer: Mhurd <[email protected]>
Gerrit-Reviewer: Mobrovac <[email protected]>
Gerrit-Reviewer: Niedzielski <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to