Subramanya Sastry has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/259430

Change subject: WIP: Normalize DOM by stripping \u200e, \u200f next to category links
......................................................................

WIP: Normalize DOM by stripping \u200e, \u200f next to category links

* To be discussed: whether to do this unconditionally
  or only when scrubWikitext is enabled.

Change-Id: I9eaf81d6e27429d4a1d9f7a2465f31259b0ae0e3
---
M lib/html2wt/normalizeDOM.js
M tests/parserTests.txt
2 files changed, 63 insertions(+), 5 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/30/259430/1

diff --git a/lib/html2wt/normalizeDOM.js b/lib/html2wt/normalizeDOM.js
index 9caeaaa..118d4a7 100644
--- a/lib/html2wt/normalizeDOM.js
+++ b/lib/html2wt/normalizeDOM.js
@@ -247,8 +247,37 @@
 };
 
 /**
- * Normalizations implemented right now:
- * -------------------------------------
+ * Unconditional normalizations implemented right now:
+ * ---------------------------------------------------
+ * 1. Remove implicit directionality marker chars (\u200e, \u200f)
+ *    around categories.
+ */
+Normalizer.prototype.normalizeNodeUnconditionally = function(node) {
+       if (!DU.isText(node) ||
+               (!DU.isCategoryLink(node.previousSibling) && !DU.isCategoryLink(node.nextSibling))) {
+               // Not a text node, or not adjacent to a category link
+               return node;
+       }
+
+       var prev = node.previousSibling;
+       var next = node.nextSibling;
+       if (!next || DU.isCategoryLink(next)) {
+               // The following can leave behind an empty text node.
+               node.nodeValue = node.nodeValue.replace(/([\u200e\u200f]+\n)?[\u200e\u200f]+$/g, '');
+               // Treat modified node as having been newly inserted
+               this.addDiffMarks(node, 'inserted');
+               this.addDiffMarks(node.parentNode, 'children-changed');
+               // Log for editors' benefit
+               this.env.log('warning/html2wt/bidi',
+                       'LRM/RLM unicode chars stripped around categories');
+       }
+
+       return node;
+};
+
+/**
+ * scrubWikitext normalizations implemented right now:
+ * ---------------------------------------------------
  * 1. Tag minimization (I/B tags) in normalizeSiblingPair
  * 2. Strip empty headings and style tags
  * 3. Force SOL transparent links to serialize before/after heading
@@ -256,8 +285,8 @@
  * 5. Space is added before escapable prefixes in table cells
  * 6. Strip <br/> from headings
  */
-Normalizer.prototype.normalizeNode = function(node) {
-       // Only if scrubWikitext flag is enabled
+Normalizer.prototype.normalizeNodeConditionally = function(node) {
+       // The following are done only if scrubWikitext flag is enabled
        if (!this.env.scrubWikitext) {
                return node;
        }
@@ -468,7 +497,8 @@
                        this.processSubtree(node, true);
                }
 
-               next = this.normalizeNode(node);
+               next = this.normalizeNodeUnconditionally(node);
+               next = this.normalizeNodeConditionally(node);
 
                // Clear insertion marker
                if (insertedSubtree) {
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index 2581218..29b4fa4 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -25784,6 +25784,34 @@
 <p>foo <span about="#mwt1" typeof="mw:Transclusion" 
data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":"&lt;span>bar&lt;/span>
 [[Category:baz]]"}},"i":0}}]}'>bar</span><span about="#mwt1"> </span><link 
rel="mw:PageProp/Category" href="./Category:Baz" about="#mwt1" 
data-parsoid='{"stx":"simple","a":{"href":"./Category:Baz"},"sa":{"href":"Category:baz"}}'/>
 bar</p>
 !! end
 
+# Careful while editing this test. There are \u200f characters
+# before and after the <link> tags in the HTML.
+# Do not remove these characters in edits.
+#
+# As part of the serialization, these bidi characters will get stripped.
+!! test
+RTL (\u200f) and LTR (\u200e) markers around category tags should be stripped
+!! options
+parsoid=html2wt
+!! html/parsoid
+<p>‏<link rel="mw:PageProp/Category" href="./קטגוריה:טקסים" />‏
+‏<link rel="mw:PageProp/Category" href="./קטגוריה:_שיטות_משפט" />‏</p>
+!! wikitext
+[[קטגוריה:טקסים]]
+[[קטגוריה: שיטות משפט]]
+!! end
+
+!! test
+RTL (\u200f) and LTR (\u200e) markers should not be stripped if followed by a text node
+!! options
+parsoid=html2wt
+!! html/parsoid
+<p><link rel="mw:PageProp/Category" href="./קטגוריה:טקסים" />‏y</p>
+!! wikitext
+[[קטגוריה:טקסים]]
+‏y
+!! end
+
 !! test
 Lists: Add space after bullets
 !! options

-- 
To view, visit https://gerrit.wikimedia.org/r/259430
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I9eaf81d6e27429d4a1d9f7a2465f31259b0ae0e3
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to