Cscott has uploaded a new change for review. https://gerrit.wikimedia.org/r/107393
Change subject: Improve internal documentation. ...................................................................... Improve internal documentation. Change-Id: Idff981db81ad4ba2aaa80e9dd3d45b872f9512cf --- M lib/db.js M lib/index.js 2 files changed, 136 insertions(+), 48 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Collection/OfflineContentGenerator/latex_renderer refs/changes/93/107393/1 diff --git a/lib/db.js b/lib/db.js index 3be467e..56b41aa 100644 --- a/lib/db.js +++ b/lib/db.js @@ -1,4 +1,4 @@ -// helpers to create key/value mappings in sqlite db +// Helpers to create/read key/value mappings in sqlite db var sqlite3 = require('sqlite3'); var when = require('when'); @@ -40,7 +40,7 @@ }); }; -// Returns a promise for the value. +// Returns a promise for the value associated with a given key. Db.prototype.get = function(key, nojson) { return this.db.then(function(db) { return P.call( @@ -54,8 +54,9 @@ }); }; +// Call the given function `f` once for each row in the database. // Returns a promise which will be resolved (with the number of keys) -// when the iteration is complete +// when the iteration is complete. Db.prototype.forEach = function(f, nojson) { var each = function(err, row) { var val = nojson ? row.val : JSON.parse(row.val); diff --git a/lib/index.js b/lib/index.js index cf5a845..6b30a65 100644 --- a/lib/index.js +++ b/lib/index.js @@ -1,3 +1,5 @@ +// Convert bundles to PDFs via LaTeX. +// --------------------------------------------------------------------- require('es6-shim'); var json = require('../package.json'); @@ -153,18 +155,27 @@ return str; }; -// helper -var updateLTRfont = function(format, poly) { - if (poly.dir==='rtl' && SCRIPT_FONTS[poly.script]) { - format.writeDecorated( - '\\renewcommand{\\LTRfont}' + - '{\\LTR'+poly.script.toLowerCase()+'font}' - ); - format.envBreak(); - format.resetSOL(); - } -}; +// --------------------------------------------------------------------- +/** + * The `Formatter` class tracks the details of LaTeX syntax, in particular + * what LaTeX calls 'paragraph mode', 'math mode', and 'LR mode'. It ensures + * that we don't try to make a line break if we haven't started a paragraph, + * and that we don't try to break a line or paragraph if we're in LR mode + * (basically, that we're inside the {} of a command argument). + * It *also* implements the Unicode Bidirectional Algorithm (using the + * node-icu-bidi package) to explicitly tag LTR and RTL runs, and contains + * a few workarounds to prevent XeTeX's "almost the Unicode bidi algorithm" + * implementation from screwing with things. (See + * http://tug.org/pipermail/xetex/2013-December/024964.html and + * http://tug.org/pipermail/xetex/2014-January/025086.html for more detail.) + * + * In the future this class might also need to handle font switching based + * on code blocks, since the fonts used for many languages do not have + * great coverage. I tried using the ucharclasses package for this, but + * it fought with polyglossia and slowed down LaTeX processing by a factor + * of 6x. + */ var Formatter = function(stream, options) { this.stream = stream; this.options = options; @@ -186,29 +197,38 @@ // should use when we compute the next set of runs. this.paragraphDir = this.contextDir; }; +/** + * Used to finish up output; writes all buffered text to a stream and + * returns a promise which will be resolved when the write is complete. + */ Formatter.prototype.flush = function() { var deferred = when.defer(); this.envBreak(); console.assert(this.stack.length === 0); // all inline styles closed - this.stream.write('', 'utf8', function() { - deferred.resolve(); + this.stream.write('', 'utf8', function(err) { + err ? deferred.reject(err) : deferred.resolve(); }); return deferred.promise; }; +// Internal: Write the given string directly to the output. Formatter.prototype._writeRaw = function(text) { this.stream.write(text, 'utf8'); }; +// This is the main workhorse of this class. It takes the queued strings +// (in `this.buffer`) and decorations (in `this.decorations`), runs the +// Unicode BiDi algorithm, and emits runs of TeX-escaped LTR/RTL text, with +// the raw LaTeX commands ('decorations') interspersed appropriately. Formatter.prototype._writeRuns = function() { var text = this.buffer.join(''); if (text.length === 0 && this.decorations.length === 0) { return; // nothing to do } this._addDecoration({ type: 'end' }); // sentinel - // get logical directionality runs in this text. + // compute directionality runs in this text. var p = new ubidi.Paragraph(text, { paraLevel: (this.paragraphDir==='ltr') ? ubidi.DEFAULT_LTR : ubidi.DEFAULT_RTL }); - + // helper: emit a decoration start/end with appropriate delimiters. var emitDecoration = function(d, opts) { /* jshint bitwise: false */ // xor operator is okay. switch (d.type) { @@ -309,18 +329,28 @@ // done; clear all the buffers this.buffer.length = this.decorations.length = this.pos = 0; }; +/** Tell the formatter this should be treated as a "start of line" (also + * "start of environment" and "start of paragraph") context. + * Used to reset formatter state after we've added some LaTeX decorations + * that don't emit text. + */ Formatter.prototype.resetSOL = function() { this.newEnv = this.newLine = this.newPara = true; }; +/** Flush the formatter buffers and indicate that this is a good place to + * change the text directionality, if necessary. */ Formatter.prototype.dirBreak = function() { this._writeRuns(); }; +/** Add an "environment break": make this a good place to start/end an + * environment. */ Formatter.prototype.envBreak = function() { if (this.newEnv) { return; } this.dirBreak(); this._writeRaw('\n'); this.newEnv = true; }; +/** Add a paragraph break. */ Formatter.prototype.paragraphBreak = function() { if (this.newPara) { return; } this.envBreak(); @@ -329,17 +359,20 @@ // this is a good place to change the xetex default bidi context dir this.dirBreak(); this.startPara = true; }; +/** Add a hard line break (only allowed within a paragraph). */ Formatter.prototype.lineBreak = function() { if (this.newLine) { return; } this.envBreak(); this._writeRaw('\\\\\n'); this.newLine = true; }; +// Internal: bookkeeping for decorations. Formatter.prototype._addDecoration = function(d) { d.pos = this.pos; this.decorations.push(d); this.newEnv = this.newLine = this.newPara = false; }; +/** Add the given literal text to the output. */ Formatter.prototype.write = function(text) { if (this.newEnv || this.newLine || this.newPara) { text = text.replace(/^\s+/, ''); // kill leading space after nl @@ -350,6 +383,12 @@ this.buffer.push(text); this.pos += text.length; }; +/** + * Add some decorated text. If `text` is omitted, this is a raw or block + * decoration. Otherwise, we will add a new inline decoration around the + * given text. `text` can be a function in that case, which is expected + * to compute the text to be added. + */ Formatter.prototype.writeDecorated = function(decoration, text) { if (text === undefined) { if (typeof(decoration)==='string') { @@ -400,6 +439,19 @@ } this.paragraphDir = dir; }; +// helper to reset LTR font in new rtl context +var updateLTRfont = function(format, poly) { + if (poly.dir==='rtl' && SCRIPT_FONTS[poly.script]) { + format.writeDecorated( + '\\renewcommand{\\LTRfont}' + + '{\\LTR'+poly.script.toLowerCase()+'font}' + ); + format.envBreak(); + format.resetSOL(); + } +}; + +// --------------------------------------------------------------------- // Predicate to determine whether the given element will be a // paragraph context in LaTeX. @@ -445,15 +497,14 @@ return false; }; -// LEVELS OF LATEX HIERARCHY -var LATEX_LEVELS = [ - 'chapter', 'section', 'subsection', 'subsubsection', 'paragraph', - // bottom out the hierarchy at subparagraph - 'subparagraph', 'subparagraph', 'subparagraph', 'subparagraph' -]; +// --------------------------------------------------------------------- -/* Document node visitor class. Collects LaTeX output as it traverses the - * document tree. */ +/** + * The `Visitor` class encapsulates most of the logic of HTML->LaTeX + * translation. It traverses the wikitext DOM tree and generates LaTeX + * output as it goes. It tracks inherited language and directionality + * information as it descends. + */ var Visitor = function(document, format, options) { this.document = document; this.format = format; @@ -631,6 +682,14 @@ /* jshint unused: vars */ this.format.write('\u200B'); // ZERO WIDTH SPACE }; + +// Levels of LaTeX sectioning hierarchy. +// Used when translating <h1>, <h2>, etc. +var LATEX_LEVELS = [ + 'chapter', 'section', 'subsection', 'subsubsection', 'paragraph', + // bottom out the hierarchy at subparagraph + 'subparagraph', 'subparagraph', 'subparagraph', 'subparagraph' +]; // H1s are "at the same level as the page title". // Don't allow them in single item collections, as the article class doesn't @@ -1000,9 +1059,19 @@ // --------------------------------------------------------------------- // Bundle, image, and file processing +// +// This code is largely asynchronous. It chains promises together +// to manage the concurrency without callback hell. It uses the +// Promises/A+ api, from the npm `when` package. Most promises +// implementations are roughly equivalent; we're using `when` because +// it has a nice implementation of guards (`when/guard`) which is +// an easy way to limit the maximum parallelism of a task to ensure +// we don't spam Mediawiki's API with hundreds of requests at once. +// We also use `P`, a set of helpers for promises that make it easier +// to work with methods which accept node-style callbacks. -// return a promise for the builddir and control file contents -// (after the bundle has been unpacked) +// Step 1: unpack a bundle, and return a promise for the builddir +// and control file contents. var unpackBundle = function(options) { var metabook, builddir, status = options.status; @@ -1037,20 +1106,22 @@ }); }; -// return a promise to have renamed a file. uses 'guard' to ensure that -// renames aren't executed in parallel (and thus we can ensure that -// filenames are unique without tying ourself in knots). Returns the -// new name (which might differ from the basename given) +// Helper: rename a file. If the desired filename already exists, then +// pick a new unique name (based on `newbase`). Uses `guard` to +// ensure that renames aren't executed in parallel, and thus we can +// ensure that filenames are unique without tying ourself in knots. +// Returns a promise for the new name (which might differ from both +// `oldname` and `newbase`). var renameFile = guard(guard.n(1), function(dir, oldname, newbase) { var exists = function(path, cb) { - // fs exists doesn't take the usual 'err' as 1st argument. fix it. + // fs.exists doesn't take the usual 'err' as 1st argument. fix that. this.exists(path, function(exists) { cb(null, exists); }); }; return P.call(exists, fs, path.join(dir, newbase)).then(function(exists) { if (!exists) { return path.join(dir, newbase); } - // use the tmp module to come up with a unique alternative + // use the tmp module to come up with a unique alternative name return P.call(tmp.tmpName, tmp, { dir: dir, prefix: '', @@ -1108,7 +1179,6 @@ }); }); - // Remove JFIF resolution information from JPGs; bogus resolution information // can cause LaTeX to abort with a "dimension too large" error if the // computed "actual size" of the image is enormous (regardless of the fact @@ -1125,9 +1195,9 @@ }); }); - -// return a promise for a map from file resource URLs to on-disk filenames -// (after image processing / renaming has been done) +// Step 2: process and rename images. +// Return a promise for a map from file resource URLs to on-disk filenames +// (after image processing / renaming has been done). var processImages = function(metabook, builddir, options) { var status = options.status; var imagedir = path.join(builddir, 'bundle', 'images'); @@ -1183,7 +1253,7 @@ }); p = when.join(p, pp); // serialize completion }).then(function() { - // do the queued image renames/conversions/etc. + // wait for the queued image renames/conversions/etc. return p; }).then(function() { // return the promised imagemap @@ -1191,14 +1261,17 @@ }); }; -// count total # of items (used for status reporting) +// Helper: count total # of items in tree (used for status reporting) var countItems = function(item) { return (item.items || []).reduce(function(sum, item) { return sum + countItems(item); }, 1); }; -// Return an empty promise after the output.tex file has been written. +// Step 3: generate a LaTeX file for each article, and another top-level +// one (`output.tex`) to tie everything together. +// Return a promise which will be resolved (with no value) after all the +// files have been written. var generateLatex = function(metabook, builddir, imagemap, options) { var status = options.status; status.createStage(countItems(metabook), 'Processing collection'); @@ -1264,6 +1337,8 @@ head += '\n'; output.write(head); + // Now recurse through the item tree generating .tex files for each + // article. var pdb = new Db( path.join(builddir, 'bundle', 'parsoid.db'), { readonly: true } ); @@ -1362,11 +1437,13 @@ if (!font || !font.name) { return; } var options = font.opts ? (',' + font.opts) : ''; if (font.cjk) { + // use xeCJK to manage CJK font switching s += '\\setCJKfamilyfont{'+p.lang+'}' + '[Script=' + p.script + options + ']{' + font.name + '}\n'; s += '\\newcommand{\\' + p.lang + 'font' + '}' + '{\\CJKfamily{' + p.lang + '}}\n'; } else { + // polyglossia font management s += '\\newfontfamily\\' + p.lang + 'font' + '[Script=' + p.script + options + ']{' + font.name + '}\n'; } @@ -1380,12 +1457,14 @@ '[Script=' + script + options + ']{' + font.name + '}\n'; // for rtl scripts, add a version which turns off fancy script // features, which we will use for embedded ltr regions + // see http://tug.org/pipermail/xetex/2014-January/025113.html if (!rtl.has(script)) { return; } s += '\\newfontfamily\\LTR' + script.toLowerCase() + 'font' + '{' + font.name + '}\n'; }); - // hackity hack: initialize the LTRfont for the collection language + // initialize the LTRfont for the main collection language updateLTRfont({ + // hackity hack: this is a trivial Formatter writeDecorated: function(ss) { s += ss; }, envBreak: function() {}, resetSOL: function() {} @@ -1396,8 +1475,8 @@ }); }; -// Return an empty promise after the latex has been either written or -// compiled to a PDF. +// Step 4: write LaTeX stub and/or compile to a PDF. +// Return a promise which will be resolved with no value when complete. var compileLatex = function(builddir, options) { var status = options.status; status.createStage(0, 'Compiling PDF'); @@ -1443,8 +1522,16 @@ } }; -// Return a promise for an exit status (0 for success) after the bundle -// specified in the options has been converted. +// --------------------------------------------------------------------- + +/** + * Main entry point. + * + * Convert a bundle to LaTeX and/or a PDF, respecting the given `options`. + * + * Return a promise for an exit status (0 for success) after the bundle + * specified in the options has been converted. + */ var convert = function(options) { var status = options.status = new StatusReporter(4, function(msg) { if (options.log) { @@ -1478,13 +1565,13 @@ if (options.debug) { throw err; } - // xxx send this error to parent process? + // xxx send this error to parent process, if there is one? console.error('Error:', err); return 1; }); }; module.exports = { - version: json.version, // version # for this code + version: json.version, // version # for this package convert: convert }; -- To view, visit https://gerrit.wikimedia.org/r/107393 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Idff981db81ad4ba2aaa80e9dd3d45b872f9512cf Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Collection/OfflineContentGenerator/latex_renderer Gerrit-Branch: master Gerrit-Owner: Cscott <canan...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits