jenkins-bot has submitted this change and it was merged. Change subject: Support "standalone mode" for single-article conversion. ......................................................................
Support "standalone mode" for single-article conversion. Change-Id: I1f53b0c811bfcbdf51317dfd622877f26763a4d4 --- M README.md M bin/mw-ocg-texter M lib/index.js A lib/standalone.js M package.json 5 files changed, 83 insertions(+), 8 deletions(-) Approvals: Cscott: Looks good to me, approved jenkins-bot: Verified diff --git a/README.md b/README.md index 3ac0613..521407a 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,16 @@ bin/mw-ocg-texter --help ``` +## Standalone mode +To convert a single article without the bundle creation step, use: +``` +bin/mw-ocg-texter -h en.wikipedia.org -t "United States" +``` +The `-h` option specifies the hostname of the wiki, and the `-t` +option gives the title to convert. The content will be fetched +from RESTBase and converted, with output to standard out (unless +the `-o` option is given). + ## Other ideas This backend should implement the [Unicode Nearly Plain-Text Encoding of Mathematics](http://unicode.org/notes/tn28/UTN28-PlainTextMath-v3.pdf) diff --git a/bin/mw-ocg-texter b/bin/mw-ocg-texter index d1a1de5..b06c0e3 100755 --- a/bin/mw-ocg-texter +++ b/bin/mw-ocg-texter @@ -20,11 +20,16 @@ .option('-D, --debug', 'Turn on debugging features (eg, full stack traces on exceptions)') .option('-T, --temporary-directory <dir>', - 'Use <dir> for temporaries, not $TMPDIR or /tmp', null); + 'Use <dir> for temporaries, not $TMPDIR or /tmp', null) + .option('-h, --domain <domain name>', + 'Wiki hostname to use to resolve the title [en.wikipedia.org]', null) + .option('-t, --title <title>', + 'Don\'t use a bundle, download the given title instead.', + null); program.parse(process.argv); -if (program.args.length === 0) { +if (program.args.length === 0 && !program.title) { console.error('A bundle filename or directory is required.'); return 1; } @@ -73,7 +78,15 @@ log: log }; -texter.convert(options).catch(function(err) { +var p; +if (!program.title) { + p = texter.convert(options); +} else { + options.domain = program.domain || 'en.wikipedia.org'; + options.title = program.title; + p = require('../lib/standalone').convert(options); +} +p.catch(function(err) { var msg = { type: 'log', level: 'error' diff --git a/lib/index.js b/lib/index.js index d01a1fa..9c20729 100644 --- a/lib/index.js +++ b/lib/index.js @@ -992,10 +992,10 @@ format.writeSummary(textEscape(metabook.summary).replace(/\s+/g, ' ')); } - var pdb = new Db( + var pdb = options.pdb || new Db( path.join(builddir, 'bundle', 'parsoid.db'), { readonly: true } ); - var sidb = new Db( + var sidb = options.sidb || new Db( path.join(builddir, 'bundle', 'siteinfo.db'), { readonly: true } ); var write = {}; @@ -1006,7 +1006,9 @@ var document, base = '', articleLanguage; var key = (item.wiki ? (item.wiki+'|') : '') + revid; return pdb.get(key, 'nojson').then(function(data) { - document = domino.createDocument(data); + // avoid redundant parsing in standalone mode (which will + // return an object with a `document` property) + document = data.document || domino.createDocument(data); var baseElem = document.querySelector('head > base[href]'); if (baseElem) { base = baseElem.getAttribute('href'). @@ -1109,5 +1111,6 @@ module.exports = { name: json.name, // package name version: json.version, // version # for this package - convert: convert + convert: convert, + generateOutput: generateOutput // for use by standalone.js }; diff --git a/lib/standalone.js b/lib/standalone.js new file mode 100644 index 0000000..d6482ae --- /dev/null +++ b/lib/standalone.js @@ -0,0 +1,46 @@ +"use strict"; +require('core-js/shim'); +var Promise = require('prfun'); +var main = require('./'); +var bundler = require('mw-ocg-bundler'); + +var convert = module.exports.convert = function(options) { + // make metabook. + return bundler.metabook.fromArticles([{ + prefix: options.prefix, + domain: options.domain, + title: options.title + }], options).then(function(metabook) { + var item = metabook.items[0]; + var Parsoid = new bundler.parsoid( + metabook.wikis, options.apiVersion, options.log + ); + var siteinfo = options.siteinfo || new bundler.siteinfo( + metabook.wikis, options.log + ); + return siteinfo.fetch(item.wiki).then(function(si) { + return Parsoid.fetch(si, item.wiki, item.title, null, 2).then(function(pr) { + var opts = Object.create(options); + item.wiki = pr.wiki; + item.title = pr.title; + item.revision = pr.getRevisionId(); + // fake a db + opts.pdb = { + get: function(key, nojson) { + return Promise.resolve({ document: pr.document }); + } + }; + opts.sidb = { + get: function(key, nojson) { + return Promise.resolve(si); + } + }; + opts.status = { + createStage: function(){}, + report: function(){}, + }; + return main.generateOutput(metabook, '/dont/use/this', opts); + }); + }); + }); +}; diff --git a/package.json b/package.json index 495c559..9115d57 100644 --- a/package.json +++ b/package.json @@ -16,7 +16,7 @@ ], "license": "GPL-2.0", "dependencies": { - "commander": "~2.5.0", + "commander": "~2.8.1", "core-js": "~0.9.1", "domino": "~1.0.17", "linewrap": "~0.2.1", @@ -25,6 +25,9 @@ "sqlite3": "~3.0.8", "tmp": "~0.0.24" }, + "optionalDependencies": { + "mw-ocg-bundler": "wikimedia/mediawiki-extensions-Collection-OfflineContentGenerator-bundler#master" + }, "devDependencies": { "jshint": "~2.6.3", "mocha": "~2.0.1" -- To view, visit https://gerrit.wikimedia.org/r/235124 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I1f53b0c811bfcbdf51317dfd622877f26763a4d4 Gerrit-PatchSet: 2 Gerrit-Project: mediawiki/extensions/Collection/OfflineContentGenerator/text_renderer Gerrit-Branch: master Gerrit-Owner: Cscott <canan...@wikimedia.org> Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org> Gerrit-Reviewer: Cscott <canan...@wikimedia.org> Gerrit-Reviewer: Nikerabbit <niklas.laxst...@gmail.com> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits