Arlolra has uploaded a new change for review. https://gerrit.wikimedia.org/r/180642
Change subject: Remove dumpGrepper files ...................................................................... Remove dumpGrepper files * Adds a dev dependency on dumpgrepper. * And a script so you don't have to go fishing in node_modules/.bin to run it. More useful in npm v2.x where you can, npm run dumpgrepper -- --help Change-Id: If21dfcf0575b15776e388e5220d1b6cb811be2f6 --- M package.json D tests/dumpGrepPatterns/martian-endtags.sh D tests/dumpGrepPatterns/misc.txt D tests/dumpGrepper.js D tests/dumpReader.js 5 files changed, 4 insertions(+), 299 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/42/180642/1 diff --git a/package.json b/package.json index 5cb9bc7..569ddfa 100644 --- a/package.json +++ b/package.json @@ -26,7 +26,8 @@ "chai": "~1.9.1", "colors": "~0.6.2", "mocha": "~1.21.4", - "supertest": "0.14.0" + "supertest": "~0.14.0", + "dumpgrepper": "~0.1.0" }, "main": "lib/index.js", "bin": { @@ -36,7 +37,8 @@ "start": "node api/server.js", "mocha": "mocha --opts tests/mocha/mocha.opts tests/mocha", "parserTests": "node tests/parserTests.js --wt2html --wt2wt --html2wt --html2html --selser --no-color --quiet --blacklist", - "test": "npm run parserTests && npm run mocha" + "test": "npm run parserTests && npm run mocha", + "dumpgrepper": "dumpgrepper" }, "repository": { "type": "git", diff --git a/tests/dumpGrepPatterns/martian-endtags.sh b/tests/dumpGrepPatterns/martian-endtags.sh deleted file mode 100755 index b0395cf..0000000 --- a/tests/dumpGrepPatterns/martian-endtags.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -# extension tag hooks enabled at en.wikipedia.org -exts="categorytree|charinsert|gallery|hiero|imagemap|inputbox|math|nowiki|poem|pre|ref|references|source|syntaxhighlight|timeline" - -wiki="nowiki|includeonly|noinclude|onlyinclude" - -# just the html5 elements -html5s="a|abbr|address|area|article|aside|audio|b|base|bdi|bdo|blockquote|body|br|button|canvas|caption|cite|code|col|colgroup|command|data|datalist|dd|del|details|dfn|div|dl|dt|em|embed|fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|head|header|hgroup|hr|html|i|iframe|img|input|ins|kbd|keygen|label|legend|li|link|map|mark|menu|meta|meter|nav|noscript|object|ol|optgroup|option|output|p|param|pre|progress|q|rp|rt|rtc|ruby|s|samp|script|section|select|small|source|span|strong|style|sub|summary|sup|table|tbody|td|textarea|tfoot|th|thead|time|title|tr|track|u|ul|var|video|wbr" - -htmlold="center|font|tt" - -normaltags="$exts|$wiki|$html5s|$htmlold" - -#regexp="<(?!\/|$exts|$htmls)[^>]*>.*?<!--([^<]+|<(\/|$exts|$htmls)[^>]*>)*<\/(?!$exts|$htmls)[^>]*>" -#regexp="<(?!/|$normaltags)[^&]+>[^&]+<!--[^&-]*</(?!$normaltags)((?!>).)+>" -regexp="</(?=[a-z])(?!$normaltags)[^>]+>" -#regexp="<(?!\/|$exts|$htmls)[^>]*>" - -#echo $regexp - -if [ -z "$1" ];then - echo "Usage: $0 <xmldump.gz>" - exit 1 -fi - -zcat $1 | node ../dumpGrepper.js -i "$regexp" diff --git a/tests/dumpGrepPatterns/misc.txt b/tests/dumpGrepPatterns/misc.txt deleted file mode 100644 index cbbcc7f..0000000 --- a/tests/dumpGrepPatterns/misc.txt +++ /dev/null @@ -1,18 +0,0 @@ -# A collection of misc interesting regexps - -# ISBN links with at least one line break (https://bugzilla.wikimedia.org/show_bug.cgi?id=29025) -(?:(?:RFC|PMID)[ \t\n\r\f]*[\n\f\r]+[ \t\n\r\f]*([0-9]+)|ISBN[ \t\n\r\f]*[\n\f\r]+[ \t\n\r\f]*(\b(?:97[89][ -]?)?(?:[0-9][ -]?){9}[0-9Xx]\b)) - -# ISBN links with at least two line breaks (https://bugzilla.wikimedia.org/show_bug.cgi?id=29025) -(?:(?:RFC|PMID)[ \t\n\r\f]*(?:[\n\f\r][ \t\n\r\f]*){2,}([0-9]+)|ISBN[ \t\n\r\f]*(?:[\n\f\r][ \t\n\r\f]*){2,}(\b(?:97[89][ -]?)?(?:[0-9][ -]?){9}[0-9Xx]\b)) - -# Template:Table_cell_templates in enwiki -{{\s*(?:rh|rh2|yes|Ya|no|Na|coming soon|bad|eliminated|Site active|Site inactive|good|yes2|won|no2|nom|sho|TBA|partial|yes-No|okay|some|any|n/a|BLACK|dunno|Unknown|Depends|Included|dropped|terminated|beta|table-experimental|free|nonfree|proprietary|needs|incorrect|no result|pending|nightly|release-candidate|[?]|unofficial|usually|rarely|sometimes|draw)\s*(?:[|]|}}) - -# cases which aren't the simple '| {{yes}}' case. -[^ \t|]\s*{{\s*(?:rh|rh2|yes|Ya|no|Na|coming soon|bad|eliminated|Site active|Site inactive|good|yes2|won|no2|nom|sho|TBA|partial|yes-No|okay|some|any|n/a|BLACK|dunno|Unknown|Depends|Included|dropped|terminated|beta|table-experimental|free|nonfree|proprietary|needs|incorrect|no result|pending|nightly|release-candidate|[?]|unofficial|usually|rarely|sometimes|draw)\s*(?:[|]|}}) - -# blank lines with more than one comment (bug 41756) -^([ ]*<!--((?!-->).)*-->){2,}[ ]*$ (use with -m option) -# more precise version, avoid those surrounded by newlines -[^\n]\n([ ]*<!--((?!-->).)*-->){2,}[ ]*\n(?!\n) diff --git a/tests/dumpGrepper.js b/tests/dumpGrepper.js deleted file mode 100755 index 546087b..0000000 --- a/tests/dumpGrepper.js +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env node -/** - * A simple dump grepper based on the DumpReader module. - */ -"use strict"; -require( '../lib/core-upgrade.js' ); - -var dumpReader = require('./dumpReader.js'), - events = require('events'), - util = require('util'), - yargs = require('yargs'), - Util = require( '../lib/mediawiki.Util.js' ).Util; - -function DumpGrepper ( regexp ) { - // inherit from EventEmitter - events.EventEmitter.call(this); - this.re = regexp; -} - -util.inherits(DumpGrepper, events.EventEmitter); - -DumpGrepper.prototype.grepRev = function ( revision, onlyFirst ) { - var result = this.re.exec( revision.text ), - matches = []; - while ( result ) { - matches.push( result ); - if ( onlyFirst ) { break; } - result = this.re.exec( revision.text ); - } - if ( matches.length ) { - this.emit( 'match', revision, matches ); - } -}; - -module.exports.DumpGrepper = DumpGrepper; - -if (module === require.main) { - var opts = yargs.usage( 'Usage: zcat dump.xml.gz | $0 <regexp>', { - 'i': { - description: 'Case-insensitive matching', - 'boolean': true, - 'default': false - }, - 'm': { - description: 'Treat ^ and $ as matching beginning/end of *each* line, instead of beginning/end of entire article', - 'boolean': true, - 'default': false - }, - 'color': { - description: 'Highlight matched substring using color. Use --no-color to disable. Default is "auto".', - 'default': 'auto' - }, - 'l': { - description: 'Suppress normal output; instead print the name of each article from which output would normally have been printed.', - 'boolean': true, - 'default': false - } - } ); - var argv = opts.argv; - - if( argv.help ) { - opts.showHelp(); - process.exit( 0 ); - } - Util.setColorFlags( argv ); - - var flags = 'g'; - if( Util.booleanOption( argv.i ) ) { - flags += 'i'; - } - if( Util.booleanOption( argv.m ) ) { - flags += 'm'; - } - - var re = new RegExp( argv._[0], flags ); - var onlyFirst = Util.booleanOption( argv.l ); - - var reader = new dumpReader.DumpReader(), - grepper = new DumpGrepper( re ), - stats = { - revisions: 0, - matches: 0 - }; - - reader.on( 'revision', function ( revision ) { - stats.revisions++; - grepper.grepRev( revision, onlyFirst ); - } ); - - grepper.on( 'match', function ( revision, matches ) { - stats.matches++; - if ( Util.booleanOption( argv.l ) ) { - console.log( revision.page.title ); - return; - } - for ( var i = 0, l = matches.length; i < l; i++ ) { - console.log( '== Match: [[' + revision.page.title + ']] ==' ); - var m = matches[i]; - //console.warn( JSON.stringify( m.index, null, 2 ) ); - console.log( - revision.text.substr( m.index - 40, 40 ) + - m[0].green + - revision.text.substr( m.index + m[0].length, 40 ) ); - } - } ); - - process.stdin.on ( 'end' , function() { - // Print some stats - console.warn( '################################################' ); - console.warn( 'Total revisions: ' + stats.revisions ); - console.warn( 'Total matches: ' + stats.matches ); - console.warn( 'Ratio: ' + (stats.matches / stats.revisions * 100) + '%' ); - console.warn( '################################################' ); - } ); - - process.stdin.on('data', reader.push.bind(reader) ); - process.stdin.setEncoding('utf8'); - process.stdin.resume(); - - -} - diff --git a/tests/dumpReader.js b/tests/dumpReader.js deleted file mode 100644 index 75185f7..0000000 --- a/tests/dumpReader.js +++ /dev/null @@ -1,130 +0,0 @@ -"use strict"; - -var events = require('events'), - util = require('util'), - libxml = require('libxmljs'); // npm install libxmljs - -function DumpReader() { - events.EventEmitter.call(this); - this.makeParser(); -} - -util.inherits(DumpReader, events.EventEmitter); - -/** - * @param {Stream} stream input stream to read XML from - */ -DumpReader.prototype.makeParser = function() { - - var self = this, - stack = [{}], - workspace = {}, - buffer = ''; - - function flip(arr) { - var obj = {}; - arr.forEach(function(val) { - obj[val] = true; - }); - return obj; - } - var textNodes = flip(['id', 'text', 'title', 'minor', 'comment', 'username', 'timestamp']), - boolNodes = flip(['minor', 'redirect']), - ignoreNodes = flip(['mediawiki', 'siteinfo', 'upload', 'thread'] ), - parser = new libxml.SaxPushParser(); - this.parser = parser; - parser.on('startElementNS', function(elem, attrs, prefix, uri, namespaces) { - //console.warn( 'elem: ' + elem ); - if (elem in ignoreNodes) { - /* jshint noempty: false */ // we know this is empty! - // ... - } else if (elem === 'page') { - //console.warn( 'starting page' ); - stack = []; - workspace = {}; - } else if (elem === 'revision') { - stack.push(workspace); - workspace = { - page: workspace - }; - } else if (elem in textNodes || elem in boolNodes) { - buffer = ''; - } else { - stack.push(workspace); - workspace = {}; - } - }); - - parser.on( 'endElementNS', function(elem, prefix, uri) { - // ping something! - if (elem === 'mediawiki') { - self.complete = true; - //stream.pause(); - self.emit('end', {}); - } else if (elem === 'page') { - self.emit('page', workspace); - workspace = stack.pop(); - } else if (elem === 'revision') { - self.emit('revision', workspace); - workspace = stack.pop(); - } else if (elem in textNodes) { - workspace[elem] = buffer; - } else if (elem in boolNodes) { - workspace[elem] = true; - } else { - var current = workspace; - workspace = stack.pop(); - workspace[elem] = current; - } - }); - - parser.on( 'characters', function(chars) { - buffer += chars; - }); - parser.on( 'cdata', function(cdata) { - buffer += cdata; - }); - parser.on( 'endDocument', function() { - // This doesn't seem to run...? - self.complete = true; - //stream.pause(); - self.emit('end', {}); - }); - parser.on( 'error', function(err) { - self.emit('error', err); - // Should we.... stop reading now or what? - }); - -}; - -DumpReader.prototype.push = function( chunk ) { - //console.log( 'dr read' + chunk ); - this.parser.push( chunk ); -}; - - -module.exports.DumpReader = DumpReader; - -if (module === require.main) { - var reader = new DumpReader(); - reader.on('end', function() { - console.log('done!'); - process.exit(); - }); - reader.on('error', function(err) { - console.log('error!', err); - process.exit(1); - }); - reader.on('page', function(page) { - console.log('page', page); - }); - reader.on('revision', function(revision) { - revision.text = revision.text.substr(0, 40); - console.log('revision', revision); - }); - console.log('Reading!'); - process.stdin.setEncoding('utf8'); - - process.stdin.on('data', reader.push.bind(reader) ); - process.stdin.resume(); -} -- To view, visit https://gerrit.wikimedia.org/r/180642 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: If21dfcf0575b15776e388e5220d1b6cb811be2f6 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Arlolra <abrea...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits