jenkins-bot has submitted this change and it was merged.

Change subject: Non-English language support via polyglossia.
......................................................................


Non-English language support via polyglossia.

Add xunicode and polyglossia packages for better language support.
Process HTML 'lang' attributes and emit the proper polyglossia
language-change command.  Add texlive-lang-all package to get all the
appropriate hyphenation patterns, etc.

Set default font to GNU FreeFont's "FreeSerif" face, which has better
non-Latin support (but no CJK characters).

Change-Id: I08454de9c33e5971c76db2313b4bc2a48e74af1d
---
M .travis.yml
M README.md
M bin/mw-latexer
M lib/index.js
A lib/polyglossia.js
5 files changed, 168 insertions(+), 5 deletions(-)

Approvals:
  Cscott: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/.travis.yml b/.travis.yml
index 24388ca..786e386 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,7 @@
   - "0.8"
 before_install:
  - sudo apt-get update -qq
- - sudo apt-get install -qq texlive-xetex texlive-latex-recommended 
texlive-fonts-recommended latex-xcolor imagemagick librsvg2-bin unzip
+ - sudo apt-get install -qq texlive-xetex texlive-latex-recommended 
texlive-fonts-recommended texlive-lang-all latex-xcolor imagemagick 
librsvg2-bin unzip
  - mkdir ~/texmf
  - unzip -d ~/texmf texdeps/fontspec.tds.zip
  - cp -r ~/texmf/tex/latex ~/texmf/tex/xelatex
diff --git a/README.md b/README.md
index a75cbc9..4db4fb6 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,9 @@
 
 Install other system dependencies.
 ```
-apt-get install texlive-xetex texlive-latex-recommended 
texlive-fonts-recommended latex-xcolor imagemagick librsvg2-bin unzip
+apt-get install texlive-xetex texlive-latex-recommended \
+               texlive-fonts-recommended texlive-lang-all latex-xcolor \
+               imagemagick librsvg2-bin unzip
 ```
 
 Note that up-to-date LaTeX `hyperref` and `fontspec` packages are
diff --git a/bin/mw-latexer b/bin/mw-latexer
index 528dd66..7c0d754 100755
--- a/bin/mw-latexer
+++ b/bin/mw-latexer
@@ -12,6 +12,8 @@
                        'Set paper size', 'letter')
        .option('-t, --toc <yes|no|auto>',
                        'Force presence/absence of table of contents [auto]', 
'auto')
+       .option('-L, --lang <2-char lang code>',
+                       'Force collection to use the given default language', 
null)
        .option('-1, --one-column',
                        'Render page in single column layout')
        .option('-v, --verbose',
@@ -47,6 +49,7 @@
        latex: !!program.latex,
        debug: !!program.debug,
        output: program.output,
+       lang: program.lang,
        onecolumn: program.oneColumn,
        log: log
 };
diff --git a/lib/index.js b/lib/index.js
index 2b6b4e5..1e691f6 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -22,6 +22,7 @@
 var Db = require('./db');
 var DomUtil = require('./domutil');
 var P = require('./p');
+var Polyglossia = require('./polyglossia');
 var StatusReporter = require('./status');
 
 var STD_HEADER = [
@@ -30,7 +31,7 @@
        "",
        "\\documentclass[10pt,twocolumn,twoside]{article}",
        "\\pagestyle{headings}",
-       "\\usepackage{fontspec, graphicx}",
+       "\\usepackage{fontspec, xunicode, polyglossia, graphicx}",
        "\\usepackage{amsmath,amsthm,amstext,amssymb}",
        "\\usepackage[usenames]{xcolor}",
        "\\definecolor{linkcolor}{rgb}{.27,0,0}",
@@ -49,6 +50,8 @@
        "\\makeatother%",
        "}",
 */
+       "\\setmainfont[]{FreeSerif}",
+/*
        // Set up Charis font
        // XXX add non-latin (CJK, etc) fonts
        "\\setmainfont[",
@@ -59,6 +62,7 @@
        "BoldItalicFont = CharisSIL-BI.ttf ,",
        "]",
        "{CharisSIL-R.ttf}",
+*/
        "\\date{}\\author{}"
 ].join("\n");
 
@@ -144,6 +148,8 @@
        this.output = [];
        this.templates = Object.create(null);
        this.base = options.base || '';
+       this.currentLanguage = options.lang || 'en';
+       this.usedLanguages = new Set();
 };
 
 // Helper function -- collect all text from the children of `node` as
@@ -172,6 +178,18 @@
        case node.ELEMENT_NODE:
                if (isHidden(node)) {
                        return;
+               }
+               // handle LANG attributes (which override everything else)
+               var lang = node.getAttribute('lang') || this.currentLanguage;
+               // in addition to eliminating no-ops, this condition allows us
+               // to recursively invoke visit() inside the LANG handler.
+               if (lang !== this.currentLanguage) {
+                       var savedLanguage = this.currentLanguage;
+                       this.currentLanguage = lang;
+                       this.usedLanguages.add(lang);
+                       var r = this['visitLANG='].apply(this, arguments);
+                       this.currentLanguage = savedLanguage;
+                       return r;
                }
                // use typeof property if possible
                if (node.hasAttribute('typeof')) {
@@ -238,7 +256,7 @@
                        href = href.substring(1);
                        return this.collect(node, function(contents) {
                                this.output.push('\\hyperlink{' + href + '}' +
-                                                                '{' + contents 
+ '}');
+                                                                '{' + contents 
+ '}%');
                        });
                } else {
                        href = url.resolve(this.base, href);
@@ -496,6 +514,23 @@
        var delimit = display ? '$$' : '$';
        var eol = display ? '' : '%';
        this.output.push(delimit + math + delimit + eol);
+};
+
+Visitor.prototype['visitLANG='] = function(node) {
+       // is this a block or a span context?
+       var isBlock = 
/^(BLOCKQUOTE|BODY|CENTER|DIV|DL|FIGURE|H[1-6]|OL|P|TABLE|UL)$/.test(node.nodeName);
 // XXX others?
+       var poly = Polyglossia.lookup(this.currentLanguage);
+       if (isBlock) {
+               this.output.push('\\begin{'+poly.env+'}['+poly.options+']%');
+               var r = this.visit(node);
+               this.output.push('\\end{'+poly.env+'}%');
+               return r;
+       } else {
+               return this.collect(node, function(contents) {
+                       
this.output.push('\\text'+poly.lang+'['+poly.options+']' +
+                                                        '{' + contents + '}%');
+               });
+       }
 };
 
 Visitor.prototype['visitTYPEOF=mw:Image'] =
@@ -758,6 +793,13 @@
        if (!singleItem) {
                head = head.replace(/\]\{article\}/, ']{report}');
        }
+       // default language (for chapter headings, page numbers, etc)
+       // CLI --lang option overrides
+       var lang = options.lang || metabook.lang || 'en';
+       var poly = Polyglossia.lookup(lang);
+       head += '\n\\setdefaultlanguage[' + poly.options + ']{' + poly.lang + 
'}';
+       var usedLanguages = new Set(); // usedLanguages doesn't include default 
lang
+       head += '\n\\input{'+path.join(builddir, 'languages.tex')+'}'; // we'll 
put used languages here
        // emit title, subtitle, etc.
        var title = metabook.title;
        if (!title && metabook.items.length === 1) {
@@ -810,13 +852,15 @@
                                base: base,
                                imagemap: imagemap,
                                singleItem: singleItem,
-                               hasChapters: hasChapters
+                               hasChapters: hasChapters,
+                               lang: lang
                        });
                        var h1 = document.createElement('h1');
                        h1.textContent = item.title;
                        visitor.visit(h1); // emit document title!
                        visitor.visit(document.body);
                        var result = visitor.output.join('\n');
+                       visitor.usedLanguages.forEach(function(l){ 
usedLanguages.add(l); });
                        return P.call(fs.writeFile, fs, outfile, result, 
'utf8');
                });
        };
@@ -835,6 +879,17 @@
                return write[item.type](item);
        }).then(function() {
                return P.call(output.end, output, STD_FOOTER);
+       }).then(function() {
+               // write languages file
+               var s = '';
+               usedLanguages.forEach(function(l) {
+                       s += Polyglossia.lookup(l).lang + ',';
+               });
+               if (s) {
+                       s = '\\setotherlanguages{'+s.replace(/,$/,'')+'}\n';
+               }
+               var filename = path.join(builddir, 'languages.tex');
+               return P.call(fs.writeFile, fs, filename, s, 'utf8');
        });
 };
 
diff --git a/lib/polyglossia.js b/lib/polyglossia.js
new file mode 100644
index 0000000..a217aa6
--- /dev/null
+++ b/lib/polyglossia.js
@@ -0,0 +1,103 @@
+/** Language and option mappings for the XeLaTeX polyglossia package. */
+var table = {
+       sq: { lang: 'albanian' },
+       am: { lang: 'amharic' },
+       ar: { lang: 'arabic', env: 'Arabic' },
+       'und-Arab': { lang: 'arabic', env: 'Arabic' },
+       hy: { lang: 'armenian' },
+       ast: { lang: 'asturian' },
+       id: { lang: 'bahasai' },
+       ms: { lang: 'bahasam' },
+       eu: { lang: 'basque' },
+       bn: { lang: 'bengali' },
+       'pt-BR': { lang: 'brazil' },
+       br: { lang: 'breton' },
+       bg: { lang: 'bulgarian' },
+       ca: { lang: 'catalan' },
+       cop: { lang: 'coptic' },
+       hr: { lang: 'croatian' },
+       cs: { lang: 'czech' },
+       da: { lang: 'danish' },
+       dv: { lang: 'divehi' },
+       nl: { lang: 'dutch' },
+       en: { lang: 'english' },
+       eo: { lang: 'esperanto' },
+       et: { lang: 'estonian' },
+       fa: { lang: 'farsi' },
+       fi: { lang: 'finnish' },
+       fr: { lang: 'french' },
+       fur: { lang: 'friulan' },
+       gl: { lang: 'galician' },
+       de: { lang: 'german' },
+       el: { lang: 'greek' },
+       'el-latn': { lang: 'greek', options: 'numerals=arabic' },
+       grc: { lang: 'greek', options: 'variant=ancient' },
+       he: { lang: 'hebrew' },
+       hi: { lang: 'hindi' },
+       is: { lang: 'icelandic' },
+       ie: { lang: 'interlingua' },
+       ga: { lang: 'irish' },
+       it: { lang: 'italian' },
+       kn: { lang: 'kannada' },
+       lo: { lang: 'lao' },
+       la: { lang: 'latin' },
+       Latn: { lang: 'latin' }, // non-standard? used in arwiki sample.
+       lv: { lang: 'latvian' },
+       lt: { lang: 'lithuanian' },
+       dsb: { lang: 'lsorbian' },
+       hu: { lang: 'magyar' },
+       ml: { lang: 'malayalam' },
+       mr: { lang: 'marathi' },
+       nqo: { lang: 'nko' },
+       no: { lang: 'norsk' },
+       nn: { lang: 'nynorsk' },
+       oc: { lang: 'occitan' },
+       pmsq: { lang: 'piedmontese' },
+       pl: { lang: 'polish' },
+       pt: { lang: 'portuges' },
+       ro: { lang: 'romanian' },
+       rm: { lang: 'romansh' },
+       ru: { lang: 'russian' },
+       sme: { lang: 'samin' },
+       sa: { lang: 'sanskrit' },
+       'sa-Latn': { lang: 'sanskrit' },
+       gd: { lang: 'scottish' },
+       sr: { lang: 'serbian' },
+       sk: { lang: 'slovak' },
+       sl: { lang: 'slovenian' },
+       es: { lang: 'spanish' },
+       sv: { lang: 'swedish' },
+       syc: { lang: 'syriac' },
+       ta: { lang: 'tamil' },
+       te: { lang: 'telugu' },
+       th: { lang: 'thai' },
+       bo: { lang: 'tibetan' },
+       tr: { lang: 'turkish' },
+       tk: { lang: 'turkmen' },
+       uk: { lang: 'ukrainian' },
+       ur: { lang: 'urdu' },
+       hsb: { lang: 'usorbian' },
+       vi: { lang: 'vietnamese' },
+       cy: { lang: 'welsh' }
+       // ja = japanese
+       // ja-Hani = japanese written in Kanji
+};
+
+var lookup = function(langcode) {
+       // langcode is an RFC1766 language code.  That is, an ISO639 code,
+       // possibly followed by a dash and a variant specifier.
+       if (!table.hasOwnProperty(langcode)) {
+               console.warn('Language support not found for', langcode);
+               // try stripping the suffix.  otherwise, fall back to 'en'
+               var stripped = langcode.replace(/-[\s\S]*$/, '');
+               langcode = table.hasOwnProperty(stripped) ? stripped : 'en';
+       }
+       var r = table[langcode];
+       if (!r.env) { r.env = r.lang; }
+       if (!r.options) { r.options = ''; }
+       return r;
+};
+
+module.exports = {
+       lookup: lookup
+};

-- 
To view, visit https://gerrit.wikimedia.org/r/98560
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I08454de9c33e5971c76db2313b4bc2a48e74af1d
Gerrit-PatchSet: 4
Gerrit-Project: 
mediawiki/extensions/Collection/OfflineContentGenerator/latex_renderer
Gerrit-Branch: master
Gerrit-Owner: Cscott <canan...@wikimedia.org>
Gerrit-Reviewer: Cscott <canan...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to