BearND has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/404514 )
Change subject: summary: reorder summarizer transforms
......................................................................
summary: reorder summarizer transforms
Do the regex manipulations first, then the DOM transformations, to
make sure no unwanted attributes get through.
Bug: T184557
Change-Id: Ic273b63bc43e0841892a215a49b23d93099b228c
---
M lib/transformations/summarize.js
1 file changed, 11 insertions(+), 13 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps
refs/changes/14/404514/1
diff --git a/lib/transformations/summarize.js b/lib/transformations/summarize.js
index 2b70ece..6b0cff7 100644
--- a/lib/transformations/summarize.js
+++ b/lib/transformations/summarize.js
@@ -106,17 +106,6 @@
* @return {!object} html summary
*/
module.exports = function(html) {
- const doc = domino.createDocument(html);
- flattenElements(doc, 'a');
- removeAttributes(doc, '*', ['data-mw']);
- rmElementsWithSelector(doc, '.mw-ref, .reference');
- rmElementsWithSelector(doc, '.noexcerpt');
- rmElementsWithSelector(doc, '.noprint');
- rmElementsWithSelector(doc, 'math');
- rmElementsWithSelector(doc, 'span:empty,b:empty,i:empty,p:empty');
- removeUnwantedNodes(doc);
-
- html = doc.body.innerHTML;
html = removeNestedParentheticals(html);
// 1. Replace any parentheticals which have at least one space inside
html = html.replace(/\([^)]+ [^)]+\)/g, ' ');
@@ -139,9 +128,18 @@
// 6. Same as 5 but for non-latin comma and no space afterwards
html = html.replace(/ ,/g, ',');
- doc.body.innerHTML = html;
+ const doc = domino.createDocument(html);
+ flattenElements(doc, 'a');
+ removeAttributes(doc, '*', ['data-mw']);
+ rmElementsWithSelector(doc, '.mw-ref, .reference');
+ rmElementsWithSelector(doc, '.noexcerpt');
+ rmElementsWithSelector(doc, '.noprint');
+ rmElementsWithSelector(doc, 'math');
+ rmElementsWithSelector(doc, 'span:empty,b:empty,i:empty,p:empty');
+ removeUnwantedNodes(doc);
+
return {
extract: doc.body.textContent,
- extract_html: html
+ extract_html: doc.body.innerHTML
};
};
--
To view, visit https://gerrit.wikimedia.org/r/404514
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic273b63bc43e0841892a215a49b23d93099b228c
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: BearND <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits