Cscott has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/107393


Change subject: Improve internal documentation.
......................................................................

Improve internal documentation.

Change-Id: Idff981db81ad4ba2aaa80e9dd3d45b872f9512cf
---
M lib/db.js
M lib/index.js
2 files changed, 136 insertions(+), 48 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Collection/OfflineContentGenerator/latex_renderer refs/changes/93/107393/1

diff --git a/lib/db.js b/lib/db.js
index 3be467e..56b41aa 100644
--- a/lib/db.js
+++ b/lib/db.js
@@ -1,4 +1,4 @@
-// helpers to create key/value mappings in sqlite db
+// Helpers to create/read key/value mappings in sqlite db
 
 var sqlite3 = require('sqlite3');
 var when = require('when');
@@ -40,7 +40,7 @@
        });
 };
 
-// Returns a promise for the value.
+// Returns a promise for the value associated with a given key.
 Db.prototype.get = function(key, nojson) {
        return this.db.then(function(db) {
                return P.call(
@@ -54,8 +54,9 @@
        });
 };
 
+// Call the given function `f` once for each row in the database.
 // Returns a promise which will be resolved (with the number of keys)
-// when the iteration is complete
+// when the iteration is complete.
 Db.prototype.forEach = function(f, nojson) {
        var each = function(err, row) {
                var val = nojson ? row.val : JSON.parse(row.val);
diff --git a/lib/index.js b/lib/index.js
index cf5a845..6b30a65 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -1,3 +1,5 @@
+// Convert bundles to PDFs via LaTeX.
+// ---------------------------------------------------------------------
 require('es6-shim');
 
 var json = require('../package.json');
@@ -153,18 +155,27 @@
        return str;
 };
 
-// helper
-var updateLTRfont = function(format, poly) {
-       if (poly.dir==='rtl' && SCRIPT_FONTS[poly.script]) {
-               format.writeDecorated(
-                       '\\renewcommand{\\LTRfont}' +
-                               '{\\LTR'+poly.script.toLowerCase()+'font}'
-               );
-               format.envBreak();
-               format.resetSOL();
-       }
-};
+// ---------------------------------------------------------------------
 
+/**
+ * The `Formatter` class tracks the details of LaTeX syntax, in particular
+ * what LaTeX calls 'paragraph mode', 'math mode', and 'LR mode'.  It ensures
+ * that we don't try to make a line break if we haven't started a paragraph,
+ * and that we don't try to break a line or paragraph if we're in LR mode
+ * (basically, that we're inside the {} of a command argument).
+ * It *also* implements the Unicode Bidirectional Algorithm (using the
+ * node-icu-bidi package) to explicitly tag LTR and RTL runs, and contains
+ * a few workarounds to prevent XeTeX's "almost the Unicode bidi algorithm"
+ * implementation from screwing with things. (See
+ * http://tug.org/pipermail/xetex/2013-December/024964.html and
+ * http://tug.org/pipermail/xetex/2014-January/025086.html for more detail.)
+ *
+ * In the future this class might also need to handle font switching based
+ * on code blocks, since the fonts used for many languages do not have
+ * great coverage.  I tried using the ucharclasses package for this, but
+ * it fought with polyglossia and slowed down LaTeX processing by a factor
+ * of 6x.
+ */
 var Formatter = function(stream, options) {
        this.stream = stream;
        this.options = options;
@@ -186,29 +197,38 @@
        // should use when we compute the next set of runs.
        this.paragraphDir = this.contextDir;
 };
+/**
+ * Used to finish up output; writes all buffered text to a stream and
+ * returns a promise which will be resolved when the write is complete.
+ */
 Formatter.prototype.flush = function() {
        var deferred = when.defer();
        this.envBreak();
        console.assert(this.stack.length === 0); // all inline styles closed
-       this.stream.write('', 'utf8', function() {
-               deferred.resolve();
+       this.stream.write('', 'utf8', function(err) {
+               err ? deferred.reject(err) : deferred.resolve();
        });
        return deferred.promise;
 };
+// Internal: Write the given string directly to the output.
 Formatter.prototype._writeRaw = function(text) {
        this.stream.write(text, 'utf8');
 };
+// This is the main workhorse of this class. It takes the queued strings
+// (in `this.buffer`) and decorations (in `this.decorations`), runs the
+// Unicode BiDi algorithm, and emits runs of TeX-escaped LTR/RTL text, with
+// the raw LaTeX commands ('decorations') interspersed appropriately.
 Formatter.prototype._writeRuns = function() {
        var text = this.buffer.join('');
        if (text.length === 0 && this.decorations.length === 0) {
                return; // nothing to do
        }
        this._addDecoration({ type: 'end' }); // sentinel
-       // get logical directionality runs in this text.
+       // compute directionality runs in this text.
        var p = new ubidi.Paragraph(text, {
                paraLevel: (this.paragraphDir==='ltr') ? ubidi.DEFAULT_LTR : 
ubidi.DEFAULT_RTL
        });
-
+       // helper: emit a decoration start/end with appropriate delimiters.
        var emitDecoration = function(d, opts) {
                /* jshint bitwise: false */ // xor operator is okay.
                switch (d.type) {
@@ -309,18 +329,28 @@
        // done; clear all the buffers
        this.buffer.length = this.decorations.length = this.pos = 0;
 };
+/** Tell the formatter this should be treated as a "start of line" (also
+ * "start of environment" and "start of paragraph") context.
+ * Used to reset formatter state after we've added some LaTeX decorations
+ * that don't emit text.
+ */
 Formatter.prototype.resetSOL = function() {
        this.newEnv = this.newLine = this.newPara = true;
 };
+/** Flush the formatter buffers and indicate that this is a good place to
+ *  change the text directionality, if necessary. */
 Formatter.prototype.dirBreak = function() {
        this._writeRuns();
 };
+/** Add an "environment break": make this a good place to start/end an
+ *  environment. */
 Formatter.prototype.envBreak = function() {
        if (this.newEnv) { return; }
        this.dirBreak();
        this._writeRaw('\n');
        this.newEnv = true;
 };
+/** Add a paragraph break. */
 Formatter.prototype.paragraphBreak = function() {
        if (this.newPara) { return; }
        this.envBreak();
@@ -329,17 +359,20 @@
        // this is a good place to change the xetex default bidi context dir
        this.dirBreak(); this.startPara = true;
 };
+/** Add a hard line break (only allowed within a paragraph). */
 Formatter.prototype.lineBreak = function() {
        if (this.newLine) { return; }
        this.envBreak();
        this._writeRaw('\\\\\n');
        this.newLine = true;
 };
+// Internal: bookkeeping for decorations.
 Formatter.prototype._addDecoration = function(d) {
        d.pos = this.pos;
        this.decorations.push(d);
        this.newEnv = this.newLine = this.newPara = false;
 };
+/** Add the given literal text to the output. */
 Formatter.prototype.write = function(text) {
        if (this.newEnv || this.newLine || this.newPara) {
                text = text.replace(/^\s+/, ''); // kill leading space after nl
@@ -350,6 +383,12 @@
        this.buffer.push(text);
        this.pos += text.length;
 };
+/**
+ * Add some decorated text.  If `text` is omitted, this is a raw or block
+ * decoration.  Otherwise, we will add a new inline decoration around the
+ * given text.  `text` can be a function in that case, which is expected
+ * to compute the text to be added.
+ */
 Formatter.prototype.writeDecorated = function(decoration, text) {
        if (text === undefined) {
                if (typeof(decoration)==='string') {
@@ -400,6 +439,19 @@
        }
        this.paragraphDir = dir;
 };
+// helper to reset LTR font in new rtl context
+var updateLTRfont = function(format, poly) {
+       if (poly.dir==='rtl' && SCRIPT_FONTS[poly.script]) {
+               format.writeDecorated(
+                       '\\renewcommand{\\LTRfont}' +
+                               '{\\LTR'+poly.script.toLowerCase()+'font}'
+               );
+               format.envBreak();
+               format.resetSOL();
+       }
+};
+
+// ---------------------------------------------------------------------
 
 // Predicate to determine whether the given element will be a
 // paragraph context in LaTeX.
@@ -445,15 +497,14 @@
        return false;
 };
 
-// LEVELS OF LATEX HIERARCHY
-var LATEX_LEVELS = [
-       'chapter', 'section', 'subsection', 'subsubsection', 'paragraph',
-       // bottom out the hierarchy at subparagraph
-       'subparagraph', 'subparagraph', 'subparagraph', 'subparagraph'
-];
+// ---------------------------------------------------------------------
 
-/* Document node visitor class.  Collects LaTeX output as it traverses the
- * document tree. */
+/**
+ * The `Visitor` class encapsulates most of the logic of HTML->LaTeX
+ * translation.  It traverses the wikitext DOM tree and generates LaTeX
+ * output as it goes.  It tracks inherited language and directionality
+ * information as it descends.
+ */
 var Visitor = function(document, format, options) {
        this.document = document;
        this.format = format;
@@ -631,6 +682,14 @@
        /* jshint unused: vars */
        this.format.write('\u200B'); // ZERO WIDTH SPACE
 };
+
+// Levels of LaTeX sectioning hierarchy.
+// Used when translating <h1>, <h2>, etc.
+var LATEX_LEVELS = [
+       'chapter', 'section', 'subsection', 'subsubsection', 'paragraph',
+       // bottom out the hierarchy at subparagraph
+       'subparagraph', 'subparagraph', 'subparagraph', 'subparagraph'
+];
 
 // H1s are "at the same level as the page title".
 // Don't allow them in single item collections, as the article class doesn't
@@ -1000,9 +1059,19 @@
 
 // ---------------------------------------------------------------------
 // Bundle, image, and file processing
+//
+// This code is largely asynchronous.  It chains promises together
+// to manage the concurrency without callback hell.  It uses the
+// Promises/A+ api, from the npm `when` package.  Most promises
+// implementations are roughly equivalent; we're using `when` because
+// it has a nice implementation of guards (`when/guard`) which is
+// an easy way to limit the maximum parallelism of a task to ensure
+// we don't spam Mediawiki's API with hundreds of requests at once.
+// We also use `P`, a set of helpers for promises that make it easier
+// to work with methods which accept node-style callbacks.
 
-// return a promise for the builddir and control file contents
-// (after the bundle has been unpacked)
+// Step 1: unpack a bundle, and return a promise for the builddir
+// and control file contents.
 var unpackBundle = function(options) {
        var metabook, builddir, status = options.status;
 
@@ -1037,20 +1106,22 @@
        });
 };
 
-// return a promise to have renamed a file.  uses 'guard' to ensure that
-// renames aren't executed in parallel (and thus we can ensure that
-// filenames are unique without tying ourself in knots).  Returns the
-// new name (which might differ from the basename given)
+// Helper: rename a file.  If the desired filename already exists, then
+// pick a new unique name (based on `newbase`).  Uses `guard` to
+// ensure that renames aren't executed in parallel, and thus we can
+// ensure that filenames are unique without tying ourself in knots.
+// Returns a promise for the new name (which might differ from both
+// `oldname` and `newbase`).
 var renameFile = guard(guard.n(1), function(dir, oldname, newbase) {
        var exists = function(path, cb) {
-               // fs exists doesn't take the usual 'err' as 1st argument.  fix 
it.
+               // fs.exists doesn't take the usual 'err' as 1st argument.  fix 
that.
                this.exists(path, function(exists) { cb(null, exists); });
        };
        return P.call(exists, fs, path.join(dir, 
newbase)).then(function(exists) {
                if (!exists) {
                        return path.join(dir, newbase);
                }
-               // use the tmp module to come up with a unique alternative
+               // use the tmp module to come up with a unique alternative name
                return P.call(tmp.tmpName, tmp, {
                        dir: dir,
                        prefix: '',
@@ -1108,7 +1179,6 @@
        });
 });
 
-
 // Remove JFIF resolution information from JPGs; bogus resolution information
 // can cause LaTeX to abort with a "dimension too large" error if the
 // computed "actual size" of the image is enormous (regardless of the fact
@@ -1125,9 +1195,9 @@
        });
 });
 
-
-// return a promise for a map from file resource URLs to on-disk filenames
-// (after image processing / renaming has been done)
+// Step 2: process and rename images.
+// Return a promise for a map from file resource URLs to on-disk filenames
+// (after image processing / renaming has been done).
 var processImages = function(metabook, builddir, options) {
        var status = options.status;
        var imagedir = path.join(builddir, 'bundle', 'images');
@@ -1183,7 +1253,7 @@
                });
                p = when.join(p, pp); // serialize completion
        }).then(function() {
-               // do the queued image renames/conversions/etc.
+               // wait for the queued image renames/conversions/etc.
                return p;
        }).then(function() {
                // return the promised imagemap
@@ -1191,14 +1261,17 @@
        });
 };
 
-// count total # of items (used for status reporting)
+// Helper: count total # of items in tree (used for status reporting)
 var countItems = function(item) {
        return (item.items || []).reduce(function(sum, item) {
                return sum + countItems(item);
        }, 1);
 };
 
-// Return an empty promise after the output.tex file has been written.
+// Step 3: generate a LaTeX file for each article, and another top-level
+// one (`output.tex`) to tie everything together.
+// Return a promise which will be resolved (with no value) after all the
+// files have been written.
 var generateLatex = function(metabook, builddir, imagemap, options) {
        var status = options.status;
        status.createStage(countItems(metabook), 'Processing collection');
@@ -1264,6 +1337,8 @@
        head += '\n';
        output.write(head);
 
+       // Now recurse through the item tree generating .tex files for each
+       // article.
        var pdb = new Db(
                path.join(builddir, 'bundle', 'parsoid.db'), { readonly: true }
        );
@@ -1362,11 +1437,13 @@
                        if (!font || !font.name) { return; }
                        var options = font.opts ? (',' + font.opts) : '';
                        if (font.cjk) {
+                               // use xeCJK to manage CJK font switching
                                s += '\\setCJKfamilyfont{'+p.lang+'}' +
                                        '[Script=' + p.script + options + ']{' 
+ font.name + '}\n';
                                s += '\\newcommand{\\' + p.lang + 'font' + '}' +
                                        '{\\CJKfamily{' + p.lang + '}}\n';
                        } else {
+                               // polyglossia font management
                                s += '\\newfontfamily\\' + p.lang + 'font' +
                                        '[Script=' + p.script + options + ']{' 
+ font.name + '}\n';
                        }
@@ -1380,12 +1457,14 @@
                                '[Script=' + script + options + ']{' + 
font.name + '}\n';
                        // for rtl scripts, add a version which turns off fancy 
script
                        // features, which we will use for embedded ltr regions
+                       // see 
http://tug.org/pipermail/xetex/2014-January/025113.html
                        if (!rtl.has(script)) { return; }
                        s += '\\newfontfamily\\LTR' + script.toLowerCase() + 
'font' +
                                '{' + font.name + '}\n';
                });
-               // hackity hack: initialize the LTRfont for the collection 
language
+               // initialize the LTRfont for the main collection language
                updateLTRfont({
+                       // hackity hack: this is a trivial Formatter
                        writeDecorated: function(ss) { s += ss; },
                        envBreak: function() {},
                        resetSOL: function() {}
@@ -1396,8 +1475,8 @@
        });
 };
 
-// Return an empty promise after the latex has been either written or
-// compiled to a PDF.
+// Step 4: write LaTeX stub and/or compile to a PDF.
+// Return a promise which will be resolved with no value when complete.
 var compileLatex = function(builddir, options) {
        var status = options.status;
        status.createStage(0, 'Compiling PDF');
@@ -1443,8 +1522,16 @@
        }
 };
 
-// Return a promise for an exit status (0 for success) after the bundle
-// specified in the options has been converted.
+// ---------------------------------------------------------------------
+
+/**
+ * Main entry point.
+ *
+ * Convert a bundle to LaTeX and/or a PDF, respecting the given `options`.
+ *
+ * Return a promise for an exit status (0 for success) after the bundle
+ * specified in the options has been converted.
+ */
 var convert = function(options) {
        var status = options.status = new StatusReporter(4, function(msg) {
                if (options.log) {
@@ -1478,13 +1565,13 @@
                if (options.debug) {
                        throw err;
                }
-               // xxx send this error to parent process?
+               // xxx send this error to parent process, if there is one?
                console.error('Error:', err);
                return 1;
        });
 };
 
 module.exports = {
-       version: json.version, // version # for this code
+       version: json.version, // version # for this package
        convert: convert
 };

-- 
To view, visit https://gerrit.wikimedia.org/r/107393
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Idff981db81ad4ba2aaa80e9dd3d45b872f9512cf
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Collection/OfflineContentGenerator/latex_renderer
Gerrit-Branch: master
Gerrit-Owner: Cscott <canan...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to