BearND has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/404514 )

Change subject: summary: reorder summarizer transforms
......................................................................

summary: reorder summarizer transforms

Do the regex manipulations first, then the DOM transformations, to
make sure no unwanted attributes get through.

Bug: T184557
Change-Id: Ic273b63bc43e0841892a215a49b23d93099b228c
---
M lib/transformations/summarize.js
1 file changed, 11 insertions(+), 13 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/14/404514/1

diff --git a/lib/transformations/summarize.js b/lib/transformations/summarize.js
index 2b70ece..6b0cff7 100644
--- a/lib/transformations/summarize.js
+++ b/lib/transformations/summarize.js
@@ -106,17 +106,6 @@
  * @return {!object} html summary
  */
 module.exports = function(html) {
-    const doc = domino.createDocument(html);
-    flattenElements(doc, 'a');
-    removeAttributes(doc, '*', ['data-mw']);
-    rmElementsWithSelector(doc, '.mw-ref, .reference');
-    rmElementsWithSelector(doc, '.noexcerpt');
-    rmElementsWithSelector(doc, '.noprint');
-    rmElementsWithSelector(doc, 'math');
-    rmElementsWithSelector(doc, 'span:empty,b:empty,i:empty,p:empty');
-    removeUnwantedNodes(doc);
-
-    html = doc.body.innerHTML;
     html = removeNestedParentheticals(html);
     // 1. Replace any parentheticals which have at least one space inside
     html = html.replace(/\([^)]+ [^)]+\)/g, ' ');
@@ -139,9 +128,18 @@
     // 6. Same as 5 but for non-latin comma and no space afterwards
     html = html.replace(/ ,/g, ',');
 
-    doc.body.innerHTML = html;
+    const doc = domino.createDocument(html);
+    flattenElements(doc, 'a');
+    removeAttributes(doc, '*', ['data-mw']);
+    rmElementsWithSelector(doc, '.mw-ref, .reference');
+    rmElementsWithSelector(doc, '.noexcerpt');
+    rmElementsWithSelector(doc, '.noprint');
+    rmElementsWithSelector(doc, 'math');
+    rmElementsWithSelector(doc, 'span:empty,b:empty,i:empty,p:empty');
+    removeUnwantedNodes(doc);
+
     return {
         extract: doc.body.textContent,
-        extract_html: html
+        extract_html: doc.body.innerHTML
     };
 };

-- 
To view, visit https://gerrit.wikimedia.org/r/404514
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic273b63bc43e0841892a215a49b23d93099b228c
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: BearND <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to