[MediaWiki-commits] [Gerrit] mediawiki...parsoid[master]: Refactor and document seperator serialization

jenkins-bot (Code Review) Wed, 25 Oct 2017 14:03:10 -0700

jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/380899 )


Change subject: Refactor and document seperator serialization
......................................................................


Refactor and document seperator serialization

 * The impetus here was to answer if it's safe to call `emitChunk`
   multiple times per node.

   It should be, and indeed we do in several places.  The first call
   resets the separator state, which clears the constraints.  Guards
   are made explicit here so that when it's called `again`, original
   separators won't get reused.

 * The legitimate case for making subsequent calls with the same node,
   (ie. `lastSourceNode` being the same) is when there're no children
   and an end tag must go out.  See the note in the table handler where
   necessary contraints are shoehorned in.

 * Removes `lastSourceSep` which has been obsolete since 8c95d797.
   That refactor put the update after resetting so that it was always
   effectively empty.  Presumably that was the right thing to do.

Change-Id: I92953d00580bf8ad7d5e34a0d804931991da1e43
---
M lib/html2wt/DOMHandlers.js
M lib/html2wt/LinkHandler.js
M lib/html2wt/SerializerState.js
M lib/html2wt/WikitextSerializer.js
M lib/html2wt/separators.js
M tests/parserTests.txt
6 files changed, 152 insertions(+), 114 deletions(-)

Approvals:
  Subramanya Sastry: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/html2wt/DOMHandlers.js b/lib/html2wt/DOMHandlers.js
index f1731b7..3165aa6 100644
--- a/lib/html2wt/DOMHandlers.js
+++ b/lib/html2wt/DOMHandlers.js
@@ -551,7 +551,7 @@
        // FIXME: Should this also check for tabs and plain space
        // chars interspersed with newlines?
        if (dp.src.match(/^\n+$/)) {
-               state.setSep((state.sep.src || '') + dp.src);
+               state.appendSep(dp.src);
        } else {
                state.serializer.emitWikitext(dp.src, node);
        }
@@ -689,7 +689,7 @@
                                }
                                if (!state.sep.constraints) {
                                        // Special case hack for "{|\n|}" since 
state.sep is
-                                       // cleared in SSP.pushSep after a 
separator is emitted.
+                                       // cleared in SSP.emitSep after a 
separator is emitted.
                                        // However, for {|\n|}, the <table> tag 
has no element
                                        // children which means lastchild -> 
parent constraint
                                        // is never computed and set here.
@@ -1005,7 +1005,7 @@
                                state.emitChunk(content, node);
 
                                // Preserve separator source
-                               state.setSep((trailingNL && trailingNL[0]) || 
'');
+                               state.appendSep((trailingNL && trailingNL[0]) 
|| '');
                        });
                }),
                sepnls: {
diff --git a/lib/html2wt/LinkHandler.js b/lib/html2wt/LinkHandler.js
index 6b52024..ad3dd31 100644
--- a/lib/html2wt/LinkHandler.js
+++ b/lib/html2wt/LinkHandler.js
@@ -665,7 +665,7 @@
                                // content string wasn't appropriate).
                                state.emitChunk(magicLinkMatch[0] === 'ISBN' ?
                                        new WikiLinkText(serialized, node, 
wiki, 'mw:WikiLink') :
-                                       new ExtLinkText(serialized, node, wiki, 
'mw:ExtLink'));
+                                       new ExtLinkText(serialized, node, wiki, 
'mw:ExtLink'), node);
                        } else {
                                state.emitChunk(new MagicLinkText(serialized, 
node), node);
                        }
diff --git a/lib/html2wt/SerializerState.js b/lib/html2wt/SerializerState.js
index eb93757..f9b3b94 100644
--- a/lib/html2wt/SerializerState.js
+++ b/lib/html2wt/SerializerState.js
@@ -24,7 +24,12 @@
  *    Separator information:
  *    - constraints: min/max number of newlines
  *    - text: collected separator text from DOM text/comment nodes
- *    - lastSourceNode: -- to be documented --
+ *    - lastSourceNode: Seems to be bookkeeping to make sure we don't reuse
+ *                      original separators when `emitChunk` is called
+ *                      consecutively on the same node.  However, it also
+ *                      differs from `state.prevNode` in that it only gets
+ *                      updated when a node calls `emitChunk` so that nodes
+ *                      serializing `justChildren` don't mix up `buildSep`.
  *
  * onSOL
  *    Is the serializer at the start of a new wikitext line?
@@ -161,20 +166,22 @@
 };
 
 /**
+ * Appends the seperator source and updates the SOL state if necessary
  */
-SSP.setSep = function(src) {
-       this.sep.src = src;
+SSP.appendSep = function(src) {
+       this.sep.src = (this.sep.src || '') + src;
        this.sepIntroducedSOL(src);
 };
 
 /**
+ * Cycle the state after processing a node
  */
 SSP.updateSep = function(node) {
        this.sep.lastSourceNode = node;
-       this.sep.lastSourceSep = this.sep.src;
 };
 
 /**
+ * Reset the current line state
  */
 SSP.resetCurrLine = function(node) {
        this.currLine = {
@@ -218,6 +225,7 @@
 });
 
 /**
+ * Extracts a subset of the page source bound by the supplied indices
  */
 SSP.getOrigSrc = function(start, end) {
        console.assert(this.selserMode);
@@ -225,6 +233,7 @@
 };
 
 /**
+ * Like it says on the tin
  */
 SSP.updateModificationFlags = function(node) {
        this.prevNodeUnmodified = this.currNodeUnmodified;
@@ -257,15 +266,17 @@
 };
 
 /**
+ * Accumulates chunks on the current line
  */
-SSP.push = function(text, node) {
+SSP.pushToCurrLine = function(text, node) {
        console.assert(text instanceof ConstrainedText);
        this.currLine.chunks.push(text);
 };
 
 /**
+ * Pushes the seperator to the current line and resets the separator state
  */
-SSP.pushSep = function(sep, node, debugPrefix) {
+SSP.emitSep = function(sep, node, debugPrefix) {
        if (!(sep instanceof String)) {
                this.env.log('warning', 'Emitting non-string value: ' + 
String(sep));
        }
@@ -276,10 +287,12 @@
                sep.text = sep.text.replace(/\n/g, ' ');
        }
 
-       this.push(sep, node);
+       this.pushToCurrLine(sep, node);
 
        // Reset separator state
        this.sep = {};
+       this.updateSep(node);
+
        this.sepIntroducedSOL(sep.text);
 
        this.env.log(this.serializer.logType,
@@ -287,6 +300,85 @@
                function() { return JSON.stringify(sep.text); });
 };
 
+/**
+ * Determines if we can use the original seperator for this node or if we
+ * need to build one based on its constraints, and then emits it.
+ *
+ * The following comment applies to `origSepUsable` but is placed outside the
+ * function body since character count (including comments) can prevent
+ * inlining in older versions of v8 (node < 8.3)
+ *
+ * ---
+ *
+ * When block nodes are deleted, the deletion affects whether unmodified
+ * newline separators between a pair of unmodified P tags can be reused.
+ *
+ * Example:
+ * Original WT  : "<div>x</div>foo\nbar"
+ * Original HTML: "<div>x</div><p>foo</p>\n<p>bar</p>"
+ * Edited HTML  : "<p>foo</p>\n<p>bar</p>"
+ * Annotated DOM: "<mw:DiffMarker is-block><p>foo</p>\n<p>bar</p>"
+ * Expected WT  : "foo\n\nbar"
+ *
+ * Note the additional newline between "foo" and "bar" even though originally,
+ * there was just a single newline.
+ *
+ * So, even though the two P tags and the separator between them is
+ * unmodified, it is insufficient to rely on just that. We have to look at
+ * what has happened on the two wikitext lines onto which the two P tags
+ * will get serialized.
+ *
+ * Now, if you check the code for 'nextToDeletedBlockNodeInWT', that code is
+ * not really looking at ALL the nodes before/after the nodes that could
+ * serialize onto the wikitext lines. It is looking at the immediately
+ * adjacent nodes, i.e. it is not necessary to look if a block-tag was
+ * deleted 2 or 5 siblings away. If we had to actually examine all of those,
+ * nodes, this would get very complex, and it would be much simpler to just
+ * discard the original separators => potentially lots of dirty diffs.
+ *
+ * To understand why it is sufficient (for correctness) to examine just
+ * the immediately adjacent nodes, let us look at an additional example.
+ *
+ * Original WT  : "a<div>b</div>c<div>d</div>e\nf"
+ * Original HTML: "<p>a</p><div>b</div><p>c</p><div>d</div><p>e</p>\n<p>f</p>"
+ *
+ * Note how <block> tags and <p> tags interleave in the HTML. This would be
+ * the case always no matter how much inline content showed up between the
+ * block tags in wikitext. If the b-`<div>` was deleted, we don't care
+ * about it, since we still have the d-`<div>` before the P tag that preserves
+ * the correctness of the single "\n" separator. If the d-`<div>` was deleted,
+ * we conservatively ignore the original separator and let normal P-P 
constraints
+ * take care of it. At worst, we might generate a dirty diff in this scenario.
+ */
+SSP.emitSepForNode = function(node) {
+       var again = (node === this.sep.lastSourceNode);
+       var origSepUsable = !again &&
+               this.prevNodeUnmodified && 
!DU.nextToDeletedBlockNodeInWT(this.env, this.prevNode, true) &&
+               this.currNodeUnmodified && 
!DU.nextToDeletedBlockNodeInWT(this.env, node, false);
+
+       var origSep;
+       if (origSepUsable) {
+               if (DU.isElt(this.prevNode) && DU.isElt(node)) {
+                       origSep = this.getOrigSrc(
+                               DU.getDataParsoid(this.prevNode).dsr[1],
+                               DU.getDataParsoid(node).dsr[0]
+                       );
+               } else {
+                       origSep = this.sep.src;
+               }
+       }
+
+       if (origSep !== undefined && WTSUtils.isValidSep(origSep)) {
+               this.emitSep(origSep, node, 'ORIG-SEP:');
+       } else {
+               var sep = this.serializer.buildSep(node);
+               this.emitSep(sep || '', node, 'SEP:');
+       }
+};
+
+/**
+ * Pushes the chunk to the current line
+ */
 SSP.emitChunk = function(res, node) {
        if (!(res instanceof String)) {
                this.env.log('warning', 'Emitting non-string value: ' + 
String(res));
@@ -298,75 +390,11 @@
                res.text = res.text.replace(/\n/g, ' ');
        }
 
-       /* 
--------------------------------------------------------------------------
-        * When block nodes are deleted, the deletion affects whether unmodified
-        * newline separators between a pair of unmodified P tags can be reused.
-        *
-        * Example:
-        * Original WT  : "<div>x</div>foo\nbar"
-        * Original HTML: "<div>x</div><p>foo</p>\n<p>bar</p>"
-        * Edited HTML  : "<p>foo</p>\n<p>bar</p>"
-        * Annotated DOM: "<mw:DiffMarker is-block><p>foo</p>\n<p>bar</p>"
-        * Expected WT  : "foo\n\nbar"
-        *
-        * Note the additional newline between "foo" and "bar" even though 
originally,
-        * there was just a single newline.
-        *
-        * So, even though the two P tags and the separator between them is
-        * unmodified, it is insufficient to rely on just that. We have to look 
at
-        * what has happened on the two wikitext lines onto which the two P tags
-        * will get serialized.
-        *
-        * Now, if you check the code for 'nextToDeletedBlockNodeInWT', that 
code is
-        * not really looking at ALL the nodes before/after the nodes that could
-        * serialize onto the wikitext lines. It is looking at the immediately
-        * adjacent nodes, i.e. it is not necessary to look if a block-tag was
-        * deleted 2 or 5 siblings away. If we had to actually examine all of 
those,
-        * nodes, this would get very complex, and it would be much simpler to 
just
-        * discard the original separators => potentially lots of dirty diffs.
-        *
-        * To understand why it is sufficient (for correctness) to examine just
-        * the immediately adjacent nodes, let us look at an additional example.
-        *
-        * Original WT  : "a<div>b</div>c<div>d</div>e\nf"
-        * Original HTML: 
"<p>a</p><div>b</div><p>c</p><div>d</div><p>e</p>\n<p>f</p>"
-        *
-        * Note how <block> tags and <p> tags interleave in the HTML. This 
would be
-        * the case always no matter how much inline content showed up between 
the
-        * block tags in wikitext. If the b-<div> was deleted, we don't care
-        * about it, since we still have the d-<div> before the P tag that 
preserves
-        * the correctness of the single "\n" separator. If the d-<div> was 
deleted,
-        * we conservatively ignore the original separator and let normal P-P 
constraints
-        * take care of it. At worst, we might generate a dirty diff in this 
scenario.
-        * 
-------------------------------------------------------------------------- */
-       var origSepUsable =
-               this.prevNodeUnmodified && 
!DU.nextToDeletedBlockNodeInWT(this.env, this.sep.lastSourceNode, true) &&
-               this.currNodeUnmodified && 
!DU.nextToDeletedBlockNodeInWT(this.env, node, false);
-
        // Emit separator first
        if (res.noSep) {
                /* skip separators for internal tokens fromSelSer */
        } else {
-               var origSep;
-               if (origSepUsable) {
-                       if (this.prevNode && DU.isElt(this.prevNode) && 
DU.isElt(node)) {
-                               origSep = this.getOrigSrc(
-                                       DU.getDataParsoid(this.prevNode).dsr[1],
-                                       DU.getDataParsoid(node).dsr[0]
-                               );
-                       } else {
-                               origSep = this.sep.src;
-                       }
-               }
-
-               if (origSep !== undefined && WTSUtils.isValidSep(origSep)) {
-                       this.pushSep(origSep, node, 'ORIG-SEP:');
-               } else {
-                       var sep = this.serializer.buildSep(node);
-                       if (sep !== undefined) {
-                               this.pushSep(sep, node, 'SEP:');
-                       }
-               }
+               this.emitSepForNode(node);
        }
 
        if (this.onSOL) {
@@ -452,10 +480,7 @@
        this.env.log(this.serializer.logType, '--->', this.logPrefix, 
function() {
                return JSON.stringify(res instanceof ConstrainedText ? res.text 
: res);
        });
-       this.push(res, node);
-
-       // Update separator state
-       this.updateSep(node);
+       this.pushToCurrLine(res, node);
 
        // Update sol flag. Test for
        // newlines followed by optional includeonly or comments
@@ -477,8 +502,27 @@
 };
 
 /**
- * Serialize children to a string.
- * Does not affect the separator state.
+ * Abstracts some steps taken in `_serializeChildrenToString` and 
`serializeDOM`
+ */
+SSP._kickOffSerialize = function(node, wtEscaper) {
+       this.updateSep(node);
+       this.currNodeUnmodified = false;
+       this.updateModificationFlags(node);
+       this.resetCurrLine(node.firstChild);
+       return this.serializeChildren(node, wtEscaper)
+       .then(function() {
+               // Emit child-parent seps.
+               this.emitSepForNode(node);
+               // We've reached EOF, flush the remaining buffered text.
+               this.flushLine();
+       }.bind(this));
+};
+
+/**
+ * Serialize children to a string
+ *
+ * FIXME(arlorla): Shouldln't affect the separator state, but accidents have
+ * have been known to happen. T109793 suggests using its own wts / state.
  */
 SSP._serializeChildrenToString = Promise.method(function(node, wtEscaper, 
inState) {
        // FIXME: Make sure that the separators emitted here conform to the
@@ -487,23 +531,22 @@
        var oldSOL = this.onSOL;
        var oldOut = this.out;
        var oldStart = this.atStartOfOutput;
-       var oldChunks = this.currLine.chunks;
+       var oldCurrLine = this.currLine;
        var oldLogPrefix = this.logPrefix;
+       // Modification flags
+       var oldPrevNodeUnmodified = this.prevNodeUnmodified;
+       var oldCurrNodeUnmodified = this.currNodeUnmodified;
+       var oldPrevNode = this.prevNode;
 
        this.out = '';
        this.logPrefix = 'OUT(C):';
        this.sep = {};
        this.onSOL = false;
        this.atStartOfOutput = false;
-       this.currLine.chunks = [];
        this[inState] = true;
 
-       return this.serializeChildren(node, wtEscaper).then(function() {
-               // Emit child-parent seps.
-               this.emitChunk('', node);
-               // We've reached EOF, flush the remaining buffered text.
-               this.flushLine();
-
+       return this._kickOffSerialize(node, wtEscaper)
+       .then(function() {
                // restore the state
                var bits = this.out;
                this.out = oldOut;
@@ -511,8 +554,12 @@
                this.sep = oldSep;
                this.onSOL = oldSOL;
                this.atStartOfOutput = oldStart;
-               this.currLine.chunks = oldChunks;
+               this.currLine = oldCurrLine;
                this.logPrefix = oldLogPrefix;
+               // Modification flags
+               this.prevNodeUnmodified = oldPrevNodeUnmodified;
+               this.currNodeUnmodified = oldCurrNodeUnmodified;
+               this.prevNode = oldPrevNode;
                return bits;
        }.bind(this));
 });
diff --git a/lib/html2wt/WikitextSerializer.js 
b/lib/html2wt/WikitextSerializer.js
index eca6a18..d404bee 100644
--- a/lib/html2wt/WikitextSerializer.js
+++ b/lib/html2wt/WikitextSerializer.js
@@ -851,7 +851,7 @@
                // Strip leading newlines and other whitespace
                var match = res.match(this.separatorREs.sepPrefixWithNlsRE);
                if (match) {
-                       state.setSep((state.sep.src || '') + match[0]);
+                       state.appendSep(match[0]);
                        res = res.substring(match[0].length);
                }
        }
@@ -890,8 +890,7 @@
        // Move trailing newlines into the next separator
        if (newSepMatch) {
                if (!state.sep.src) {
-                       state.setSep(newSepMatch[0]);
-                       state.updateSep(node);
+                       state.appendSep(newSepMatch[0]);
                } else {
                        /* SSS FIXME: what are we doing with the stripped NLs?? 
*/
                }
@@ -1089,9 +1088,11 @@
                case node.ELEMENT_NODE:
                        // Ignore DiffMarker metas, but clear unmodified node 
state
                        if (DU.isMarkerMeta(node, "mw:DiffMarker")) {
-                               state.sep.lastSourceNode = node;
-                               // Update modification flags
                                state.updateModificationFlags(node);
+                               // `state.sep.lastSourceNode` is cleared here 
so that removed
+                               // separators between otherwise unmodified 
nodes don't get
+                               // restored.
+                               state.updateSep(node);
                                return node.nextSibling;
                        }
                        domHandler = this._getDOMHandler(node);
@@ -1106,7 +1107,7 @@
                        var text = node.nodeValue;
                        if (!state.inIndentPre &&
                                        
text.match(state.serializer.separatorREs.pureSepRE)) {
-                               state.setSep((state.sep.src || '') + text);
+                               state.appendSep(text);
                                return node.nextSibling;
                        }
                        if (state.selserMode) {
@@ -1125,7 +1126,7 @@
                        break;
                case node.COMMENT_NODE:
                        // Merge this into separators
-                       state.setSep((state.sep.src || '') + 
WTSUtils.commentWT(node.nodeValue));
+                       state.appendSep(WTSUtils.commentWT(node.nodeValue));
                        return node.nextSibling;
                default:
                        console.assert("Unhandled node type:", node.outerHTML);
@@ -1380,15 +1381,8 @@
                DU.dumpDOM(body, 'DOM: post-normal');
        }
 
-       state.updateSep(body);
-       state.resetCurrLine(body.firstChild);
-
-       return state.serializeChildren(body).then(function() {
-               // Emit child-parent seps.
-               state.emitChunk('', body);
-               // We've reached EOF, flush the remaining buffered text.
-               state.flushLine();
-
+       return state._kickOffSerialize(body)
+       .then(function() {
                if (state.hasIndentPreNowikis) {
                        // FIXME: Perhaps this can be done on a per-line basis
                        // rather than do one post-pass on the entire document.
diff --git a/lib/html2wt/separators.js b/lib/html2wt/separators.js
index e6e16b9..0f341a8 100644
--- a/lib/html2wt/separators.js
+++ b/lib/html2wt/separators.js
@@ -483,8 +483,9 @@
         * In other scenarios, DSR values on "adjacent" nodes in the edited DOM
         * may not reflect deleted content between them.
         * 
---------------------------------------------------------------------- */
-       var origSepUsable = node && prevNode && node !== prevNode &&
-               state.selserMode && !state.inModifiedContent &&
+       var again = (node === prevNode);
+       var origSepUsable = !again &&
+                       state.selserMode && !state.inModifiedContent &&
                        !DU.nextToDeletedBlockNodeInWT(state.env, prevNode, 
true) &&
                        !DU.nextToDeletedBlockNodeInWT(state.env, node, false) 
&&
                        WTSUtils.origSrcValidInEditedContext(state.env, 
prevNode) &&
@@ -599,10 +600,6 @@
                                }
                        } else {
                                this.env.log("info/html2wt", "dsr backwards: 
should not happen!");
-                       }
-
-                       if (state.sep.lastSourceSep) {
-                               sep = state.sep.lastSourceSep + sep;
                        }
                }
        }
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index d046a5c..f027649 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -22258,7 +22258,7 @@
 |}
 !! end
 
-# Tests LanguageVariantText._fromSelser
+# Tests LanguageVariantText._fromSelSer
 !! test
 LanguageConverter selser (4)
 !! options

-- 
To view, visit https://gerrit.wikimedia.org/r/380899
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I92953d00580bf8ad7d5e34a0d804931991da1e43
Gerrit-PatchSet: 14
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Arlolra <abrea...@wikimedia.org>
Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org>
Gerrit-Reviewer: C. Scott Ananian <canan...@wikimedia.org>
Gerrit-Reviewer: Sbailey <sbai...@wikimedia.org>
Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] mediawiki...parsoid[master]: Refactor and document seperator serialization

Reply via email to