GWicke has submitted this change and it was merged.

Change subject: Adding support for capturing raw attribute value strings 
(vsrc), and for an --inputfile option for parse test as an alternative to stdin
......................................................................


Adding support for capturing raw attribute value strings (vsrc), and for an 
--inputfile option for parse test as an alternative to stdin

Amend by gwicke:
- Only escape entities in the serializer when an attribute value is not
  round-tripped from source.
- Comment out a stray debug print

Overall, 10 additional wt2wt tests are passing with this patch.

Change-Id: Ia26be4af360bd67926170cabe1a76cb20e007475
---
M js/lib/mediawiki.WikitextSerializer.js
M js/lib/pegTokenizer.pegjs.txt
M js/tests/parse.js
3 files changed, 117 insertions(+), 69 deletions(-)

Approvals:
  GWicke: Verified; Looks good to me, approved



diff --git a/js/lib/mediawiki.WikitextSerializer.js 
b/js/lib/mediawiki.WikitextSerializer.js
index 58a6277..c674637 100644
--- a/js/lib/mediawiki.WikitextSerializer.js
+++ b/js/lib/mediawiki.WikitextSerializer.js
@@ -1967,7 +1967,7 @@
                        'data-parsoid-serialize': 1
                };
 
-       var kv, k, v, tplKV, tplK, tplV;
+       var kv, k, vInfo, v, tplKV, tplK, tplV;
        for ( var i = 0, l = attribs.length; i < l; i++ ) {
                kv = attribs[i];
                k = kv.k;
@@ -1984,7 +1984,8 @@
                        } else {
                                tplK = tplAttrState.ks[k],
                                tplV = tplAttrState.vs[k],
-                               v    = token.getAttributeShadowInfo(k).value;
+                               vInfo = token.getAttributeShadowInfo(k),
+                               v = vInfo.value;
 
                                // Deal with k/v's that were template-generated
                                if (tplK) {
@@ -1995,8 +1996,10 @@
                                }
 
                                if (v.length ) {
-                                       // Escape HTML entities
-                                       v = Util.escapeEntities(v);
+                                       if (!vInfo.fromsrc) {
+                                               // Escape HTML entities
+                                               v = Util.escapeEntities(v);
+                                       }
                                        out.push(k + '=' + '"' + v.replace( 
/"/g, '&quot;' ) + '"');
                                } else {
                                        out.push(k);
diff --git a/js/lib/pegTokenizer.pegjs.txt b/js/lib/pegTokenizer.pegjs.txt
index be7dbff..eb9f64c 100644
--- a/js/lib/pegTokenizer.pegjs.txt
+++ b/js/lib/pegTokenizer.pegjs.txt
@@ -164,6 +164,18 @@
         }
     }
 
+    /**
+    * Get an attribute value and source, given a start and end position.  
Returned object will have a 'value' property
+    * holding the value (first argument) and a 'valueSrc' property holding the 
raw value source
+    */
+    function get_attribute_value_and_source( attrVal, attrValPosStart, 
attrValPosEnd ) {
+        //console.log([attrVal, attrValPosStart, attrValPosEnd].join(", "));
+        return {
+            value: attrVal,
+            valueSrc: input.substring(attrValPosStart, attrValPosEnd)
+        }
+    }
+
     /* End static utilities */
 
     /*
@@ -1601,13 +1613,15 @@
     namePos0:({return pos;})
     name:generic_attribute_name
     namePos:({return pos;})
-    value:(( space / newline )*
-          v:generic_attribute_newline_value { return v })?
+    valueData:(( space / newline )*
+        v:generic_attribute_newline_value { return v })?
 {
     //console.warn('generic_newline_attribute: ' + pp( name ))
     var res;
-    if ( value !== '' ) {
+    if ( valueData !== '' ) {
+        var value = valueData.value;
         res = new KV( name, value );
+        res.vsrc = valueData.valueSrc;
     } else {
         res = new KV( name, '' );
     }
@@ -1623,15 +1637,17 @@
     namePos0:({return pos;})
     name:generic_attribute_name
     namePos:({return pos;})
-    value:(optionalSpaceToken
+    valueData:(optionalSpaceToken
           v:generic_attribute_value { return v })?
 {
     //console.warn( 'generic attribute: ' + pp([name, value]));
     // FIXME: name might just be a template, which can expand to a key-value
     // pair later. We'll need to handle that in the AttributeTransformManager.
     var res;
-    if ( value !== '' ) {
+    if ( valueData !== '' ) {
+        var value = valueData.value;
         res = new KV( name, value );
+        res.vsrc = valueData.valueSrc;
     } else {
         res = new KV( name, '' );
     }
@@ -1672,23 +1688,37 @@
 
 // Attribute value, quoted variants can span multiple lines.
 xml_att_value
-  = "'" r:( t1:attribute_preprocessor_text_single "'" { return t1; }
+  = "'" r:(valPos1:({return pos;}) t1:attribute_preprocessor_text_single? 
valPos2:({return pos;}) "'"
+            { return get_attribute_value_and_source(t1, valPos1, valPos2); }
         // Missing end quote: accept | and > look-ahead as heuristic
-        / t2:attribute_preprocessor_text_single_broken &[|>] {return t2;} ) { 
return r; }
-  / '"' r:( t1:attribute_preprocessor_text_double '"' { return t1; }
+        / valPos1:({return pos;}) 
t2:attribute_preprocessor_text_single_broken? valPos2:({return pos;}) &[|>]
+            { return get_attribute_value_and_source(t2, valPos1, valPos2); } )
+                { return r; }
+  / '"' r:(valPos1:({return pos;}) t1:attribute_preprocessor_text_double? 
valPos2:({return pos;}) '"'
+            { return get_attribute_value_and_source(t1, valPos1, valPos2); }
         // Missing end quote: accept | and > look-ahead as heuristic
-        / t2:attribute_preprocessor_text_double_broken &[|>] {return t2;} ) { 
return r; }
-  / attribute_preprocessor_text
+        / valPos1:({return pos;}) 
t2:attribute_preprocessor_text_double_broken? valPos2:({return pos;}) &[|>]
+            { return get_attribute_value_and_source(t2, valPos1, valPos2); } )
+                { return r; }
+  / valPos1:({return pos;}) t:attribute_preprocessor_text? valPos2:({return 
pos;})
+        { return get_attribute_value_and_source(t, valPos1, valPos2); }
 
 // Attribute value, restricted to a single line.
 att_value
-  = "'" r:( t1:attribute_preprocessor_text_single_line "'" { return t1; }
+  = "'" r:(valPos1:({return pos;}) t1:attribute_preprocessor_text_single_line? 
valPos2:({return pos;}) "'"
+            { return get_attribute_value_and_source(t1, valPos1, valPos2); }
         // Missing end quote: accept | and > look-ahead as heuristic
-        / t2:attribute_preprocessor_text_single_line_broken &[|>] {return t2;} 
) { return r; }
-  / '"' r:( t1:attribute_preprocessor_text_double_line '"' { return t1; }
+        / valPos1:({return pos;}) 
t2:attribute_preprocessor_text_single_line_broken? valPos2:({return pos;}) &[|>]
+            { return get_attribute_value_and_source(t2, valPos1, valPos2); } )
+                { return r; }
+  / '"' r:(valPos1:({return pos;}) t1:attribute_preprocessor_text_double_line? 
valPos2:({return pos;}) '"'
+            { return get_attribute_value_and_source(t1, valPos1, valPos2); }
         // Missing end quote: accept | and > look-ahead as heuristic
-        / t2:attribute_preprocessor_text_double_line_broken &[|>] {return t2;} 
) { return r; }
-  / attribute_preprocessor_text_line
+        / valPos1:({return pos;}) 
t2:attribute_preprocessor_text_double_line_broken? valPos2:({return pos;}) &[|>]
+            { return get_attribute_value_and_source(t2, valPos1, valPos2); } )
+                { return r; }
+  / valPos1:({return pos;}) t:attribute_preprocessor_text_line? 
valPos2:({return pos;})
+        { return get_attribute_value_and_source(t, valPos1, valPos2); }
 
 /*
  * A variant of generic_tag, but also checks if the tag name is a block-level
diff --git a/js/tests/parse.js b/js/tests/parse.js
index d8ee011..105b8bc 100644
--- a/js/tests/parse.js
+++ b/js/tests/parse.js
@@ -144,7 +144,12 @@
                        description: 'File containing the old page text for a 
selective-serialization operation (see --selser)',
                        'boolean': false,
                        'default': false
-               }
+               },
+        'inputfile': {
+            description: 'File containing input as an alternative to stdin',
+            'boolean': false,
+            'default': false
+        }
        });
 
        var argv = opts.argv;
@@ -215,58 +220,68 @@
                        stdout = process.stdout,
                        inputChunks = [];
 
-               // collect input
-               stdin.resume();
-               stdin.setEncoding('utf8');
-               stdin.on( 'data', function( chunk ) {
-                       inputChunks.push( chunk );
-               } );
-
                // process input
-               stdin.on( 'end', function() {
-                       var input = inputChunks.join('');
-                       if (argv.html2wt || argv.html2html) {
-                               var doc = Util.parseHTML('<html><body>' + 
input.replace(/\r/g, '') + '</body></html>'),
-                                       wt = '';
+        var processInput = function() {
+            var input = inputChunks.join('');
+            if (argv.html2wt || argv.html2html) {
+                var doc = Util.parseHTML('<html><body>' + input.replace(/\r/g, 
'') + '</body></html>'),
+                    wt = '';
 
-                               serializer.serializeDOM( doc.body, function ( 
chunk ) {
-                                       wt += chunk;
-                               }, function () {
-                                       env.page.src = wt;
-                                       if (argv.html2wt) {
-                                               // add a trailing newline for 
shell user's benefit
-                                               stdout.write(wt);
-                                       } else {
-                                               parserPipeline.on('document', 
function(document) {
-                                                       stdout.write( 
Util.serializeNode(document.body) );
-                                               });
-                                               parserPipeline.process(wt);
-                                       }
+                serializer.serializeDOM( doc.body, function ( chunk ) {
+                    wt += chunk;
+                }, function () {
+                    env.page.src = wt;
+                    if (argv.html2wt) {
+// add a trailing newline for shell user's benefit
+                        stdout.write(wt);
+                    } else {
+                        parserPipeline.on('document', function(document) {
+                            stdout.write( Util.serializeNode(document.body) );
+                        });
+                        parserPipeline.process(wt);
+                    }
 
-                               } );
-                       } else {
-                               parserPipeline.on('document', function ( 
document ) {
-                                       var res, finishCb = function 
(trailingNL) {
-                                               stdout.write( res );
-                                               if (trailingNL) {
-                                                       stdout.write("\n");
-                                               }
-                                       };
-                                       if (argv.wt2html) {
-                                               res = 
Util.serializeNode(document.body);
-                                               finishCb(true);
-                                       } else {
-                                               res = '';
-                                               serializer.serializeDOM( 
document.body, function ( chunk ) {
-                                                       res += chunk;
-                                               }, finishCb );
-                                       }
-                               });
+                } );
+            } else {
+                parserPipeline.on('document', function ( document ) {
+                    var res, finishCb = function (trailingNL) {
+                        stdout.write( res );
+                        if (trailingNL) {
+                            stdout.write("\n");
+                        }
+                    };
+                    if (argv.wt2html) {
+                        res = Util.serializeNode(document.body);
+                        finishCb(true);
+                    } else {
+                        res = '';
+                        serializer.serializeDOM( document.body, function ( 
chunk ) {
+                            res += chunk;
+                        }, finishCb );
+                    }
+                });
 
-                               // Kick off the pipeline by feeding the input 
into the parser pipeline
-                               env.page.src = input;
-                               parserPipeline.process( input );
-                       }
-               } );
+// Kick off the pipeline by feeding the input into the parser pipeline
+                env.page.src = input;
+                parserPipeline.process( input );
+            }
+        };
+
+
+        if (argv.inputfile) {
+            //read input from the file, then process
+            var fileContents = fs.readFileSync(argv.inputfile, 'utf8');
+            inputChunks.push(fileContents);
+            processInput();
+        }
+        else {
+            // collect input
+            stdin.resume();
+            stdin.setEncoding('utf8');
+            stdin.on( 'data', function( chunk ) {
+                inputChunks.push( chunk );
+            } );
+            stdin.on( 'end', processInput );
+        }
        } );
 } )();

-- 
To view, visit https://gerrit.wikimedia.org/r/54491
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ia26be4af360bd67926170cabe1a76cb20e007475
Gerrit-PatchSet: 4
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: GWicke <gwi...@wikimedia.org>
Gerrit-Reviewer: GWicke <gwi...@wikimedia.org>
Gerrit-Reviewer: Jeff evans <jeffrey.wayne.evans+wikime...@gmail.com>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to