GWicke has submitted this change and it was merged. Change subject: Adding support for capturing raw attribute value strings (vsrc), and for an --inputfile option for parse test as an alternative to stdin ......................................................................
Adding support for capturing raw attribute value strings (vsrc), and for an --inputfile option for parse test as an alternative to stdin Amend by gwicke: - Only escape entities in the serializer when an attribute value is not round-tripped from source. - Comment out a stray debug print Overall, 10 additional wt2wt tests are passing with this patch. Change-Id: Ia26be4af360bd67926170cabe1a76cb20e007475 --- M js/lib/mediawiki.WikitextSerializer.js M js/lib/pegTokenizer.pegjs.txt M js/tests/parse.js 3 files changed, 117 insertions(+), 69 deletions(-) Approvals: GWicke: Verified; Looks good to me, approved diff --git a/js/lib/mediawiki.WikitextSerializer.js b/js/lib/mediawiki.WikitextSerializer.js index 58a6277..c674637 100644 --- a/js/lib/mediawiki.WikitextSerializer.js +++ b/js/lib/mediawiki.WikitextSerializer.js @@ -1967,7 +1967,7 @@ 'data-parsoid-serialize': 1 }; - var kv, k, v, tplKV, tplK, tplV; + var kv, k, vInfo, v, tplKV, tplK, tplV; for ( var i = 0, l = attribs.length; i < l; i++ ) { kv = attribs[i]; k = kv.k; @@ -1984,7 +1984,8 @@ } else { tplK = tplAttrState.ks[k], tplV = tplAttrState.vs[k], - v = token.getAttributeShadowInfo(k).value; + vInfo = token.getAttributeShadowInfo(k), + v = vInfo.value; // Deal with k/v's that were template-generated if (tplK) { @@ -1995,8 +1996,10 @@ } if (v.length ) { - // Escape HTML entities - v = Util.escapeEntities(v); + if (!vInfo.fromsrc) { + // Escape HTML entities + v = Util.escapeEntities(v); + } out.push(k + '=' + '"' + v.replace( /"/g, '&quot;' ) + '"'); } else { out.push(k); diff --git a/js/lib/pegTokenizer.pegjs.txt b/js/lib/pegTokenizer.pegjs.txt index be7dbff..eb9f64c 100644 --- a/js/lib/pegTokenizer.pegjs.txt +++ b/js/lib/pegTokenizer.pegjs.txt @@ -164,6 +164,18 @@ } } + /** + * Get an attribute value and source, given a start and end position. 
Returned object will have a 'value' property + * holding the value (first argument) and a 'valueSrc' property holding the raw value source + */ + function get_attribute_value_and_source( attrVal, attrValPosStart, attrValPosEnd ) { + //console.log([attrVal, attrValPosStart, attrValPosEnd].join(", ")); + return { + value: attrVal, + valueSrc: input.substring(attrValPosStart, attrValPosEnd) + } + } + /* End static utilities */ /* @@ -1601,13 +1613,15 @@ namePos0:({return pos;}) name:generic_attribute_name namePos:({return pos;}) - value:(( space / newline )* - v:generic_attribute_newline_value { return v })? + valueData:(( space / newline )* + v:generic_attribute_newline_value { return v })? { //console.warn('generic_newline_attribute: ' + pp( name )) var res; - if ( value !== '' ) { + if ( valueData !== '' ) { + var value = valueData.value; res = new KV( name, value ); + res.vsrc = valueData.valueSrc; } else { res = new KV( name, '' ); } @@ -1623,15 +1637,17 @@ namePos0:({return pos;}) name:generic_attribute_name namePos:({return pos;}) - value:(optionalSpaceToken + valueData:(optionalSpaceToken v:generic_attribute_value { return v })? { //console.warn( 'generic attribute: ' + pp([name, value])); // FIXME: name might just be a template, which can expand to a key-value // pair later. We'll need to handle that in the AttributeTransformManager. var res; - if ( value !== '' ) { + if ( valueData !== '' ) { + var value = valueData.value; res = new KV( name, value ); + res.vsrc = valueData.valueSrc; } else { res = new KV( name, '' ); } @@ -1672,23 +1688,37 @@ // Attribute value, quoted variants can span multiple lines. xml_att_value - = "'" r:( t1:attribute_preprocessor_text_single "'" { return t1; } + = "'" r:(valPos1:({return pos;}) t1:attribute_preprocessor_text_single? 
valPos2:({return pos;}) "'" + { return get_attribute_value_and_source(t1, valPos1, valPos2); } // Missing end quote: accept | and > look-ahead as heuristic - / t2:attribute_preprocessor_text_single_broken &[|>] {return t2;} ) { return r; } - / '"' r:( t1:attribute_preprocessor_text_double '"' { return t1; } + / valPos1:({return pos;}) t2:attribute_preprocessor_text_single_broken? valPos2:({return pos;}) &[|>] + { return get_attribute_value_and_source(t2, valPos1, valPos2); } ) + { return r; } + / '"' r:(valPos1:({return pos;}) t1:attribute_preprocessor_text_double? valPos2:({return pos;}) '"' + { return get_attribute_value_and_source(t1, valPos1, valPos2); } // Missing end quote: accept | and > look-ahead as heuristic - / t2:attribute_preprocessor_text_double_broken &[|>] {return t2;} ) { return r; } - / attribute_preprocessor_text + / valPos1:({return pos;}) t2:attribute_preprocessor_text_double_broken? valPos2:({return pos;}) &[|>] + { return get_attribute_value_and_source(t2, valPos1, valPos2); } ) + { return r; } + / valPos1:({return pos;}) t:attribute_preprocessor_text? valPos2:({return pos;}) + { return get_attribute_value_and_source(t, valPos1, valPos2); } // Attribute value, restricted to a single line. att_value - = "'" r:( t1:attribute_preprocessor_text_single_line "'" { return t1; } + = "'" r:(valPos1:({return pos;}) t1:attribute_preprocessor_text_single_line? valPos2:({return pos;}) "'" + { return get_attribute_value_and_source(t1, valPos1, valPos2); } // Missing end quote: accept | and > look-ahead as heuristic - / t2:attribute_preprocessor_text_single_line_broken &[|>] {return t2;} ) { return r; } - / '"' r:( t1:attribute_preprocessor_text_double_line '"' { return t1; } + / valPos1:({return pos;}) t2:attribute_preprocessor_text_single_line_broken? valPos2:({return pos;}) &[|>] + { return get_attribute_value_and_source(t2, valPos1, valPos2); } ) + { return r; } + / '"' r:(valPos1:({return pos;}) t1:attribute_preprocessor_text_double_line? 
valPos2:({return pos;}) '"' + { return get_attribute_value_and_source(t1, valPos1, valPos2); } // Missing end quote: accept | and > look-ahead as heuristic - / t2:attribute_preprocessor_text_double_line_broken &[|>] {return t2;} ) { return r; } - / attribute_preprocessor_text_line + / valPos1:({return pos;}) t2:attribute_preprocessor_text_double_line_broken? valPos2:({return pos;}) &[|>] + { return get_attribute_value_and_source(t2, valPos1, valPos2); } ) + { return r; } + / valPos1:({return pos;}) t:attribute_preprocessor_text_line? valPos2:({return pos;}) + { return get_attribute_value_and_source(t, valPos1, valPos2); } /* * A variant of generic_tag, but also checks if the tag name is a block-level diff --git a/js/tests/parse.js b/js/tests/parse.js index d8ee011..105b8bc 100644 --- a/js/tests/parse.js +++ b/js/tests/parse.js @@ -144,7 +144,12 @@ description: 'File containing the old page text for a selective-serialization operation (see --selser)', 'boolean': false, 'default': false - } + }, + 'inputfile': { + description: 'File containing input as an alternative to stdin', + 'boolean': false, + 'default': false + } }); var argv = opts.argv; @@ -215,58 +220,68 @@ stdout = process.stdout, inputChunks = []; - // collect input - stdin.resume(); - stdin.setEncoding('utf8'); - stdin.on( 'data', function( chunk ) { - inputChunks.push( chunk ); - } ); - // process input - stdin.on( 'end', function() { - var input = inputChunks.join(''); - if (argv.html2wt || argv.html2html) { - var doc = Util.parseHTML('<html><body>' + input.replace(/\r/g, '') + '</body></html>'), - wt = ''; + var processInput = function() { + var input = inputChunks.join(''); + if (argv.html2wt || argv.html2html) { + var doc = Util.parseHTML('<html><body>' + input.replace(/\r/g, '') + '</body></html>'), + wt = ''; - serializer.serializeDOM( doc.body, function ( chunk ) { - wt += chunk; - }, function () { - env.page.src = wt; - if (argv.html2wt) { - // add a trailing newline for shell user's benefit - 
stdout.write(wt); - } else { - parserPipeline.on('document', function(document) { - stdout.write( Util.serializeNode(document.body) ); - }); - parserPipeline.process(wt); - } + serializer.serializeDOM( doc.body, function ( chunk ) { + wt += chunk; + }, function () { + env.page.src = wt; + if (argv.html2wt) { +// add a trailing newline for shell user's benefit + stdout.write(wt); + } else { + parserPipeline.on('document', function(document) { + stdout.write( Util.serializeNode(document.body) ); + }); + parserPipeline.process(wt); + } - } ); - } else { - parserPipeline.on('document', function ( document ) { - var res, finishCb = function (trailingNL) { - stdout.write( res ); - if (trailingNL) { - stdout.write("\n"); - } - }; - if (argv.wt2html) { - res = Util.serializeNode(document.body); - finishCb(true); - } else { - res = ''; - serializer.serializeDOM( document.body, function ( chunk ) { - res += chunk; - }, finishCb ); - } - }); + } ); + } else { + parserPipeline.on('document', function ( document ) { + var res, finishCb = function (trailingNL) { + stdout.write( res ); + if (trailingNL) { + stdout.write("\n"); + } + }; + if (argv.wt2html) { + res = Util.serializeNode(document.body); + finishCb(true); + } else { + res = ''; + serializer.serializeDOM( document.body, function ( chunk ) { + res += chunk; + }, finishCb ); + } + }); - // Kick off the pipeline by feeding the input into the parser pipeline - env.page.src = input; - parserPipeline.process( input ); - } - } ); +// Kick off the pipeline by feeding the input into the parser pipeline + env.page.src = input; + parserPipeline.process( input ); + } + }; + + + if (argv.inputfile) { + //read input from the file, then process + var fileContents = fs.readFileSync(argv.inputfile, 'utf8'); + inputChunks.push(fileContents); + processInput(); + } + else { + // collect input + stdin.resume(); + stdin.setEncoding('utf8'); + stdin.on( 'data', function( chunk ) { + inputChunks.push( chunk ); + } ); + stdin.on( 'end', 
processInput ); + } } ); } )(); -- To view, visit https://gerrit.wikimedia.org/r/54491 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ia26be4af360bd67926170cabe1a76cb20e007475 Gerrit-PatchSet: 4 Gerrit-Project: mediawiki/extensions/Parsoid Gerrit-Branch: master Gerrit-Owner: GWicke <gwi...@wikimedia.org> Gerrit-Reviewer: GWicke <gwi...@wikimedia.org> Gerrit-Reviewer: Jeff evans <jeffrey.wayne.evans+wikime...@gmail.com> Gerrit-Reviewer: jenkins-bot _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits