[MediaWiki-commits] [Gerrit] Support "standalone mode" for single-article conversion. - change (mediawiki...text_renderer)

2015-09-14 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged.

Change subject: Support "standalone mode" for single-article conversion.
..


Support "standalone mode" for single-article conversion.

Change-Id: I1f53b0c811bfcbdf51317dfd622877f26763a4d4
---
M README.md
M bin/mw-ocg-texter
M lib/index.js
A lib/standalone.js
M package.json
5 files changed, 83 insertions(+), 8 deletions(-)

Approvals:
  Cscott: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/README.md b/README.md
index 3ac0613..521407a 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,16 @@
 bin/mw-ocg-texter --help
 ```
 
+## Standalone mode
+To convert a single article without the bundle creation step, use:
+```
+bin/mw-ocg-texter -h en.wikipedia.org -t "United States"
+```
+The `-h` option specifies the hostname of the wiki, and the `-t`
+option gives the title to convert.  The content will be fetched
+from RESTBase and converted, with output to standard out (unless
+the `-o` option is given).
+
 ## Other ideas
 This backend should implement the [Unicode Nearly Plain-Text Encoding of
 Mathematics](http://unicode.org/notes/tn28/UTN28-PlainTextMath-v3.pdf)
diff --git a/bin/mw-ocg-texter b/bin/mw-ocg-texter
index d1a1de5..b06c0e3 100755
--- a/bin/mw-ocg-texter
+++ b/bin/mw-ocg-texter
@@ -20,11 +20,16 @@
.option('-D, --debug',
'Turn on debugging features (eg, full stack traces on exceptions)')
.option('-T, --temporary-directory <dir>',
-   'Use <dir> for temporaries, not $TMPDIR or /tmp', null);
+   'Use <dir> for temporaries, not $TMPDIR or /tmp', null)
+   .option('-h, --domain ',
+   'Wiki hostname to use to resolve the title [en.wikipedia.org]', null)
+   .option('-t, --title ',
+   'Don\'t use a bundle, download the given title instead.',
+   null);
 
 program.parse(process.argv);
 
-if (program.args.length === 0) {
+if (program.args.length === 0 && !program.title) {
console.error('A bundle filename or directory is required.');
return 1;
 }
@@ -73,7 +78,15 @@
log: log
 };
 
-texter.convert(options).catch(function(err) {
+var p;
+if (!program.title) {
+   p = texter.convert(options);
+} else {
+   options.domain = program.domain || 'en.wikipedia.org';
+   options.title = program.title;
+   p = require('../lib/standalone').convert(options);
+}
+p.catch(function(err) {
var msg = {
type: 'log',
level: 'error'
diff --git a/lib/index.js b/lib/index.js
index d01a1fa..9c20729 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -992,10 +992,10 @@

format.writeSummary(textEscape(metabook.summary).replace(/\s+/g, ' '));
}
 
-   var pdb = new Db(
+   var pdb = options.pdb || new Db(
path.join(builddir, 'bundle', 'parsoid.db'), { readonly: true }
);
-   var sidb = new Db(
+   var sidb = options.sidb || new Db(
path.join(builddir, 'bundle', 'siteinfo.db'), { readonly: true }
);
var write = {};
@@ -1006,7 +1006,9 @@
var document, base = '', articleLanguage;
var key = (item.wiki ? (item.wiki+'|') : '') + revid;
return pdb.get(key, 'nojson').then(function(data) {
-   document = domino.createDocument(data);
+   // avoid redundant parsing in standalone mode (which will
+   // return an object with a `document` property)
+   document = data.document || domino.createDocument(data);
var baseElem = document.querySelector('head > base[href]');
if (baseElem) {
base = baseElem.getAttribute('href').
@@ -1109,5 +1111,6 @@
 module.exports = {
name: json.name, // package name
version: json.version, // version # for this package
-   convert: convert
+   convert: convert,
+   generateOutput: generateOutput // for use by standalone.js
 };
diff --git a/lib/standalone.js b/lib/standalone.js
new file mode 100644
index 0000000..d6482ae
--- /dev/null
+++ b/lib/standalone.js
@@ -0,0 +1,46 @@
+"use strict";
+require('core-js/shim');
+var Promise = require('prfun');
+var main = require('./');
+var bundler = require('mw-ocg-bundler');
+
+var convert = module.exports.convert = function(options) {
+   // make metabook.
+   return bundler.metabook.fromArticles([{
+   prefix: options.prefix,
+   domain: options.domain,
+   title: options.title
+   }], options).then(function(metabook) {
+   var item = metabook.items[0];
+   var Parsoid = new bundler.parsoid(
+   metabook.wikis, options.apiVersion, options.log
+   );
+   var siteinfo = options.siteinfo || 

[MediaWiki-commits] [Gerrit] Support "standalone mode" for single-article conversion. - change (mediawiki...text_renderer)

2015-08-31 Thread Cscott (Code Review)
Cscott has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/235124

Change subject: Support "standalone mode" for single-article conversion.
..

Support "standalone mode" for single-article conversion.

Change-Id: I1f53b0c811bfcbdf51317dfd622877f26763a4d4
---
M bin/mw-ocg-texter
M lib/index.js
A lib/standalone.js
M package.json
4 files changed, 73 insertions(+), 8 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Collection/OfflineContentGenerator/text_renderer refs/changes/24/235124/1

diff --git a/bin/mw-ocg-texter b/bin/mw-ocg-texter
index d1a1de5..b06c0e3 100755
--- a/bin/mw-ocg-texter
+++ b/bin/mw-ocg-texter
@@ -20,11 +20,16 @@
.option('-D, --debug',
'Turn on debugging features (eg, full stack traces on exceptions)')
.option('-T, --temporary-directory <dir>',
-   'Use <dir> for temporaries, not $TMPDIR or /tmp', null);
+   'Use <dir> for temporaries, not $TMPDIR or /tmp', null)
+   .option('-h, --domain ',
+   'Wiki hostname to use to resolve the title [en.wikipedia.org]', null)
+   .option('-t, --title ',
+   'Don\'t use a bundle, download the given title instead.',
+   null);
 
 program.parse(process.argv);
 
-if (program.args.length === 0) {
+if (program.args.length === 0 && !program.title) {
console.error('A bundle filename or directory is required.');
return 1;
 }
@@ -73,7 +78,15 @@
log: log
 };
 
-texter.convert(options).catch(function(err) {
+var p;
+if (!program.title) {
+   p = texter.convert(options);
+} else {
+   options.domain = program.domain || 'en.wikipedia.org';
+   options.title = program.title;
+   p = require('../lib/standalone').convert(options);
+}
+p.catch(function(err) {
var msg = {
type: 'log',
level: 'error'
diff --git a/lib/index.js b/lib/index.js
index d01a1fa..9c20729 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -992,10 +992,10 @@

format.writeSummary(textEscape(metabook.summary).replace(/\s+/g, ' '));
}
 
-   var pdb = new Db(
+   var pdb = options.pdb || new Db(
path.join(builddir, 'bundle', 'parsoid.db'), { readonly: true }
);
-   var sidb = new Db(
+   var sidb = options.sidb || new Db(
path.join(builddir, 'bundle', 'siteinfo.db'), { readonly: true }
);
var write = {};
@@ -1006,7 +1006,9 @@
var document, base = '', articleLanguage;
var key = (item.wiki ? (item.wiki+'|') : '') + revid;
return pdb.get(key, 'nojson').then(function(data) {
-   document = domino.createDocument(data);
+   // avoid redundant parsing in standalone mode (which will
+   // return an object with a `document` property)
+   document = data.document || domino.createDocument(data);
var baseElem = document.querySelector('head > base[href]');
if (baseElem) {
base = baseElem.getAttribute('href').
@@ -1109,5 +1111,6 @@
 module.exports = {
name: json.name, // package name
version: json.version, // version # for this package
-   convert: convert
+   convert: convert,
+   generateOutput: generateOutput // for use by standalone.js
 };
diff --git a/lib/standalone.js b/lib/standalone.js
new file mode 100644
index 0000000..d6482ae
--- /dev/null
+++ b/lib/standalone.js
@@ -0,0 +1,46 @@
+"use strict";
+require('core-js/shim');
+var Promise = require('prfun');
+var main = require('./');
+var bundler = require('mw-ocg-bundler');
+
+var convert = module.exports.convert = function(options) {
+   // make metabook.
+   return bundler.metabook.fromArticles([{
+   prefix: options.prefix,
+   domain: options.domain,
+   title: options.title
+   }], options).then(function(metabook) {
+   var item = metabook.items[0];
+   var Parsoid = new bundler.parsoid(
+   metabook.wikis, options.apiVersion, options.log
+   );
+   var siteinfo = options.siteinfo || new bundler.siteinfo(
+   metabook.wikis, options.log
+   );
+   return siteinfo.fetch(item.wiki).then(function(si) {
+   return Parsoid.fetch(si, item.wiki, item.title, null, 2).then(function(pr) {
+   var opts = Object.create(options);
+   item.wiki = pr.wiki;
+   item.title = pr.title;
+   item.revision = pr.getRevisionId();
+   // fake a db
+   opts.pdb = {
+