Subramanya Sastry has uploaded a new change for review.
https://gerrit.wikimedia.org/r/259430
Change subject: WIP: Normalize DOM by stripping \u200e, \u200f next to category
links
......................................................................
WIP: Normalize DOM by stripping \u200e, \u200f next to category links
* To be discussed: whether to do this unconditionally
or only when scrubWikitext is enabled.
Change-Id: I9eaf81d6e27429d4a1d9f7a2465f31259b0ae0e3
---
M lib/html2wt/normalizeDOM.js
M tests/parserTests.txt
2 files changed, 63 insertions(+), 5 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid
refs/changes/30/259430/1
diff --git a/lib/html2wt/normalizeDOM.js b/lib/html2wt/normalizeDOM.js
index 9caeaaa..118d4a7 100644
--- a/lib/html2wt/normalizeDOM.js
+++ b/lib/html2wt/normalizeDOM.js
@@ -247,8 +247,37 @@
};
/**
- * Normalizations implemented right now:
- * -------------------------------------
+ * Unconditional normalizations implemented right now:
+ * ---------------------------------------------------
+ * 1. Remove implicit directionality marker chars (\u200e, \u200f)
+ * found at the end of text nodes that sit next to category links.
+ */
+Normalizer.prototype.normalizeNodeUnconditionally = function(node) {
+ if (!DU.isText(node) ||
+ (!DU.isCategoryLink(node.previousSibling) &&
!DU.isCategoryLink(node.nextSibling))) {
+ // Nothing to do: either not a text node, or not adjacent to a category link
+ return node;
+ }
+
+ var prev = node.previousSibling;
+ var next = node.nextSibling;
+ if (!next || DU.isCategoryLink(next)) {
+ // Strips LRM/RLM chars at the end of the text only; can leave behind an empty text node.
+ node.nodeValue =
node.nodeValue.replace(/([\u200e\u200f]+\n)?[\u200e\u200f]+$/g, '');
+ // Treat the node as newly inserted (NOTE(review): marked even if nothing was stripped)
+ this.addDiffMarks(node, 'inserted');
+ this.addDiffMarks(node.parentNode, 'children-changed');
+ // Log for editors' benefit
+ this.env.log('warning/html2wt/bidi',
+ 'LRM/RLM unicode chars stripped around categories');
+ }
+
+ return node;
+};
+
+/**
+ * scrubWikitext normalizations implemented right now:
+ * ---------------------------------------------------
* 1. Tag minimization (I/B tags) in normalizeSiblingPair
* 2. Strip empty headings and style tags
* 3. Force SOL transparent links to serialize before/after heading
@@ -256,8 +285,8 @@
* 5. Space is added before escapable prefixes in table cells
* 6. Strip <br/> from headings
*/
-Normalizer.prototype.normalizeNode = function(node) {
- // Only if scrubWikitext flag is enabled
+Normalizer.prototype.normalizeNodeConditionally = function(node) {
+ // The following are done only if scrubWikitext flag is enabled
if (!this.env.scrubWikitext) {
return node;
}
@@ -468,7 +497,8 @@
this.processSubtree(node, true);
}
- next = this.normalizeNode(node);
+ next = this.normalizeNodeUnconditionally(node);
+ next = this.normalizeNodeConditionally(node);
// Clear insertion marker
if (insertedSubtree) {
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index 2581218..29b4fa4 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -25784,6 +25784,34 @@
<p>foo <span about="#mwt1" typeof="mw:Transclusion"
data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":"<span>bar</span>
[[Category:baz]]"}},"i":0}}]}'>bar</span><span about="#mwt1"> </span><link
rel="mw:PageProp/Category" href="./Category:Baz" about="#mwt1"
data-parsoid='{"stx":"simple","a":{"href":"./Category:Baz"},"sa":{"href":"Category:baz"}}'/>
bar</p>
!! end
+# Careful while editing this test. There are \u200f characters
+# before and after the <link> tags in the HTML.
+# Do not remove these characters in edits.
+#
+# As part of the serialization, these bidi characters will get stripped.
+!! test
+RTL (\u200f) and LTR (\u200e) markers around category tags should be stripped
+!! options
+parsoid=html2wt
+!! html/parsoid
+<p><link rel="mw:PageProp/Category" href="./קטגוריה:טקסים" />
+<link rel="mw:PageProp/Category" href="./קטגוריה:_שיטות_משפט" /></p>
+!! wikitext
+[[קטגוריה:טקסים]]
+[[קטגוריה: שיטות משפט]]
+!! end
+
+!! test
+RTL (\u200f) and LTR (\u200e) markers should not be stripped if followed by a
text node
+!! options
+parsoid=html2wt
+!! html/parsoid
+<p><link rel="mw:PageProp/Category" href="./קטגוריה:טקסים" />y</p>
+!! wikitext
+[[קטגוריה:טקסים]]
+y
+!! end
+
!! test
Lists: Add space after bullets
!! options
--
To view, visit https://gerrit.wikimedia.org/r/259430
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I9eaf81d6e27429d4a1d9f7a2465f31259b0ae0e3
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits